mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-15 12:55:51 +00:00
Merge remote-tracking branch 'amDosion/feat/computer-use-windows'
This commit is contained in:
70
DEV-LOG.md
70
DEV-LOG.md
@@ -1,5 +1,75 @@
|
||||
# DEV-LOG
|
||||
|
||||
## Computer Use Windows 增强:窗口绑定截图 + UI Automation + OCR (2026-04-03)
|
||||
|
||||
在三平台基础实现之上,利用 Windows 原生 API 增强 Computer Use 的 Windows 专属能力。
|
||||
|
||||
**新增文件:**
|
||||
|
||||
| 文件 | 行数 | 说明 |
|
||||
|------|------|------|
|
||||
| `src/utils/computerUse/win32/windowCapture.ts` | — | `PrintWindow` 窗口绑定截图,支持被遮挡/后台窗口 |
|
||||
| `src/utils/computerUse/win32/windowEnum.ts` | — | `EnumWindows` 精确窗口枚举(HWND + PID + 标题) |
|
||||
| `src/utils/computerUse/win32/uiAutomation.ts` | — | `IUIAutomation` UI 元素树读取、按钮点击、文本写入、坐标识别 |
|
||||
| `src/utils/computerUse/win32/ocr.ts` | — | `Windows.Media.Ocr` 截图+文字识别(英语+中文) |
|
||||
|
||||
**修改文件:**
|
||||
|
||||
| 文件 | 变更 |
|
||||
|------|------|
|
||||
| `packages/@ant/computer-use-swift/src/backends/win32.ts` | `listRunning` 改用 EnumWindows;新增 `captureWindowTarget` 窗口级截图 |
|
||||
|
||||
**验证结果(Windows x64):**
|
||||
- 窗口枚举:38 个可见窗口 ✅
|
||||
- 窗口截图:VS Code 2575x1415, 444KB ✅(PrintWindow, 即使被遮挡)
|
||||
- UI Automation:坐标元素识别 ✅
|
||||
- OCR:识别 VS Code 界面文字,34 行 ✅
|
||||
|
||||
---
|
||||
|
||||
## Enable Computer Use — macOS + Windows + Linux (2026-04-03)
|
||||
|
||||
恢复 Computer Use 屏幕操控功能。参考项目仅 macOS,本次扩展为三平台支持。
|
||||
|
||||
**Phase 1 — MCP server stub 替换:**
|
||||
从参考项目复制 `@ant/computer-use-mcp` 完整实现(12 文件,6517 行)。
|
||||
|
||||
**Phase 2 — 移除 src/ 中 8 处 macOS 硬编码:**
|
||||
|
||||
| 文件 | 改动 |
|
||||
|------|------|
|
||||
| `src/main.tsx:1605` | 去掉 `getPlatform() === 'macos'` |
|
||||
| `src/utils/computerUse/swiftLoader.ts` | 移除 darwin-only throw |
|
||||
| `src/utils/computerUse/executor.ts` | 平台守卫扩展为 darwin+win32+linux;剪贴板按平台分发(pbcopy→PowerShell→xclip);paste 快捷键 command→ctrl |
|
||||
| `src/utils/computerUse/drainRunLoop.ts` | 非 darwin 直接执行 fn() |
|
||||
| `src/utils/computerUse/escHotkey.ts` | 非 darwin 返回 false(Ctrl+C fallback) |
|
||||
| `src/utils/computerUse/hostAdapter.ts` | 非 darwin 权限检查返回 granted |
|
||||
| `src/utils/computerUse/common.ts` | platform + screenshotFiltering 动态化 |
|
||||
| `src/utils/computerUse/gates.ts` | enabled:true + hasRequiredSubscription→true |
|
||||
|
||||
**Phase 3 — input/swift 包 dispatcher + backends 三平台架构:**
|
||||
|
||||
```
|
||||
packages/@ant/computer-use-{input,swift}/src/
|
||||
├── index.ts ← dispatcher
|
||||
├── types.ts ← 共享接口
|
||||
└── backends/
|
||||
├── darwin.ts ← macOS AppleScript(原样拆出,不改逻辑)
|
||||
├── win32.ts ← Windows PowerShell
|
||||
└── linux.ts ← Linux xdotool/scrot/xrandr/wmctrl
|
||||
```
|
||||
|
||||
**编译开关:** `CHICAGO_MCP` 加入 DEFAULT_FEATURES + DEFAULT_BUILD_FEATURES
|
||||
|
||||
**验证结果(Windows x64):**
|
||||
- `isSupported: true` ✅
|
||||
- 鼠标定位 + 前台窗口信息 ✅
|
||||
- 双显示器检测 2560x1440 × 2 ✅
|
||||
- 全屏截图 3MB base64 ✅
|
||||
- `bun run build` 463 files ✅
|
||||
|
||||
---
|
||||
|
||||
## Enable Voice Mode / VOICE_MODE (2026-04-03)
|
||||
|
||||
恢复 `/voice` 语音输入功能。`src/` 下所有 voice 相关源码已与官方一致(0 行差异),问题出在:① `VOICE_MODE` 编译开关未开,命令不显示;② `audio-capture-napi` 是 SoX 子进程 stub(Windows 不支持),缺少官方原生 `.node` 二进制。
|
||||
|
||||
2
build.ts
2
build.ts
@@ -10,7 +10,7 @@ rmSync(outdir, { recursive: true, force: true });
|
||||
|
||||
// Default features that match the official CLI build.
|
||||
// Additional features can be enabled via FEATURE_<NAME>=1 env vars.
|
||||
const DEFAULT_BUILD_FEATURES = ["AGENT_TRIGGERS_REMOTE", "VOICE_MODE"];
|
||||
const DEFAULT_BUILD_FEATURES = ["AGENT_TRIGGERS_REMOTE", "CHICAGO_MCP", "VOICE_MODE"];
|
||||
|
||||
// Collect FEATURE_* env vars → Bun.build features
|
||||
const envFeatures = Object.keys(process.env)
|
||||
|
||||
315
docs/features/computer-use-windows-enhancement.md
Normal file
315
docs/features/computer-use-windows-enhancement.md
Normal file
@@ -0,0 +1,315 @@
|
||||
# Computer Use Windows 增强实施计划
|
||||
|
||||
更新时间:2026-04-03
|
||||
依赖文档:`docs/features/windows-ai-desktop-control.md`、`docs/features/computer-use.md`
|
||||
|
||||
## 1. 目标
|
||||
|
||||
在已有的 PowerShell 子进程方案基础上,利用 Windows 原生 API 增强 Computer Use 的 Windows 实现,解决 3 个核心问题:
|
||||
|
||||
1. **窗口绑定截图**:当前 `CopyFromScreen` 只能全屏截图,无法对指定窗口截图(尤其是被遮挡/最小化窗口)
|
||||
2. **UI 结构感知**:当前只能通过坐标点击,无法像 macOS Accessibility 那样理解 UI 元素树
|
||||
3. **性能**:每次 PowerShell 启动约 273ms,剪贴板/窗口枚举等高频操作需要更快的方式
|
||||
|
||||
## 2. 已验证的 Windows API 能力
|
||||
|
||||
以下 API 全部通过 PowerShell P/Invoke 实测通过:
|
||||
|
||||
| 能力 | API | 验证结果 |
|
||||
|------|-----|---------|
|
||||
| 窗口绑定截图 | `PrintWindow(hwnd, hdc, PW_RENDERFULLCONTENT)` | ✅ VS Code 342KB, Chrome 273KB |
|
||||
| 枚举窗口+HWND | `EnumWindows` + `GetWindowText` + `GetWindowThreadProcessId` | ✅ 38 个窗口,含 HWND/PID/标题 |
|
||||
| UI 元素树 | `System.Windows.Automation.AutomationElement` | ✅ 记事本 39 个元素 |
|
||||
| UI 写值 | `ValuePattern.SetValue()` | ✅ 成功写入记事本文本 |
|
||||
| UI 点击 | `InvokePattern.Invoke()` | ✅ 按钮可程序化点击 |
|
||||
| 坐标元素识别 | `AutomationElement.FromPoint(x, y)` | ✅ 返回元素类型+名称 |
|
||||
| OCR | `Windows.Media.Ocr.OcrEngine` | ✅ 英语+中文引擎可用 |
|
||||
| 全局热键 | `RegisterHotKey` | ✅ API 可调 |
|
||||
| 剪贴板直接操作 | `System.Windows.Forms.Clipboard` | ✅ 读/写/图片检测 |
|
||||
| Shell 启动 | `ShellExecute` | ✅ 打开文件/URL/应用 |
|
||||
|
||||
## 3. 架构设计
|
||||
|
||||
### 3.1 文件结构
|
||||
|
||||
在现有 `backends/win32.ts` 基础上新增 Windows 专属模块:
|
||||
|
||||
```
|
||||
packages/@ant/computer-use-input/src/
|
||||
├── backends/
|
||||
│ ├── darwin.ts ← 不动
|
||||
│ ├── win32.ts ← 增强:直接 Win32 API 替代部分 PowerShell
|
||||
│ └── linux.ts ← 不动
|
||||
|
||||
packages/@ant/computer-use-swift/src/
|
||||
├── backends/
|
||||
│ ├── darwin.ts ← 不动
|
||||
│ ├── win32.ts ← 增强:PrintWindow 窗口截图 + EnumWindows
|
||||
│ └── linux.ts ← 不动
|
||||
|
||||
packages/@ant/computer-use-mcp/src/
|
||||
│ └── tools.ts ← 增加 Windows 专属工具定义(UI Automation、OCR)
|
||||
|
||||
src/utils/computerUse/
|
||||
│ └── win32/ ← 新增目录:Windows 专属能力
|
||||
│ ├── uiAutomation.ts ← UI 元素树、点击、写值
|
||||
│ ├── ocr.ts ← 截图 + OCR 文字识别
|
||||
│ ├── windowCapture.ts ← PrintWindow 窗口绑定截图
|
||||
│ └── windowEnum.ts ← EnumWindows 窗口枚举
|
||||
```
|
||||
|
||||
### 3.2 分层
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────┐
|
||||
│ Computer Use MCP Tools │
|
||||
│ screenshot / click / type / request_access │
|
||||
│ + Windows 专属: ui_tree / ocr / window_cap │
|
||||
├──────────────────────────────────────────────┤
|
||||
│ src/utils/computerUse/ │
|
||||
│ executor.ts → 按平台 dispatch │
|
||||
│ win32/ → Windows 专属能力模块 │
|
||||
├──────────────────────────────────────────────┤
|
||||
│ packages/@ant/computer-use-{input,swift} │
|
||||
│ backends/win32.ts → PowerShell + Win32 API │
|
||||
├──────────────────────────────────────────────┤
|
||||
│ Windows Native API │
|
||||
│ PrintWindow / EnumWindows / UI Automation │
|
||||
│ SendInput / Clipboard / OCR / ShellExecute │
|
||||
└──────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## 4. 实施计划
|
||||
|
||||
### Phase A:窗口绑定截图(解决核心问题)
|
||||
|
||||
**问题**:当前 `CopyFromScreen` 只能全屏截图,无法对指定窗口截图。
|
||||
**方案**:用 `PrintWindow` + `FindWindow` 实现窗口级截图。
|
||||
|
||||
| 步骤 | 文件 | 改动 |
|
||||
|------|------|------|
|
||||
| A.1 | `src/utils/computerUse/win32/windowCapture.ts` | 新建:`captureWindow(title)` 用 PrintWindow 截取指定窗口 |
|
||||
| A.2 | `src/utils/computerUse/win32/windowEnum.ts` | 新建:`listWindows()` 用 EnumWindows 返回 {hwnd, pid, title}[] |
|
||||
| A.3 | `packages/@ant/computer-use-swift/src/backends/win32.ts` | `screenshot.captureExcluding` 增加按窗口截图能力 |
|
||||
| A.4 | `packages/@ant/computer-use-swift/src/backends/win32.ts` | `apps.listRunning` 用 EnumWindows 替代 Get-Process(返回 HWND) |
|
||||
|
||||
**PowerShell 脚本核心**:
|
||||
|
||||
```powershell
|
||||
# PrintWindow 截取指定窗口
|
||||
Add-Type -AssemblyName System.Drawing
|
||||
Add-Type -ReferencedAssemblies System.Drawing @'
|
||||
using System; using System.Runtime.InteropServices; using System.Drawing; using System.Drawing.Imaging;
|
||||
public class WinCap {
|
||||
[DllImport("user32.dll", CharSet=CharSet.Unicode)]
|
||||
public static extern IntPtr FindWindow(string c, string t);
|
||||
[DllImport("user32.dll")]
|
||||
public static extern bool GetWindowRect(IntPtr h, out RECT r);
|
||||
[DllImport("user32.dll")]
|
||||
public static extern bool PrintWindow(IntPtr h, IntPtr hdc, uint f);
|
||||
[StructLayout(LayoutKind.Sequential)]
|
||||
public struct RECT { public int L, T, R, B; }
|
||||
// ... CaptureByTitle(string title) → base64
|
||||
}
|
||||
'@
|
||||
```
|
||||
|
||||
**验证标准**:
|
||||
- 能按窗口标题截图
|
||||
- 被遮挡的窗口也能截图
|
||||
- 返回 base64 + width + height
|
||||
|
||||
### Phase B:UI Automation(Windows 专属新能力)
|
||||
|
||||
**问题**:macOS 有 Accessibility API 可以读取/操作 UI 元素,Windows 当前只能坐标点击。
|
||||
**方案**:用 `System.Windows.Automation` 实现 UI 树读取和元素操作。
|
||||
|
||||
| 步骤 | 文件 | 改动 |
|
||||
|------|------|------|
|
||||
| B.1 | `src/utils/computerUse/win32/uiAutomation.ts` | 新建:核心 UIA 操作封装 |
|
||||
| B.2 | `packages/@ant/computer-use-mcp/src/tools.ts` | 增加 Windows 专属工具定义 |
|
||||
|
||||
**uiAutomation.ts 导出函数**:
|
||||
|
||||
```typescript
|
||||
// 获取窗口的 UI 元素树
|
||||
getUITree(windowTitle: string, depth: number): UIElement[]
|
||||
|
||||
// 按名称/类型/AutomationId 查找元素
|
||||
findElement(windowTitle: string, query: {name?, controlType?, automationId?}): UIElement | null
|
||||
|
||||
// 点击元素(InvokePattern)
|
||||
clickElement(windowTitle: string, automationId: string): boolean
|
||||
|
||||
// 设置元素值(ValuePattern)
|
||||
setValue(windowTitle: string, automationId: string, value: string): boolean
|
||||
|
||||
// 获取坐标处的元素
|
||||
elementAtPoint(x: number, y: number): UIElement | null
|
||||
```
|
||||
|
||||
**UIElement 类型**:
|
||||
```typescript
|
||||
interface UIElement {
|
||||
name: string
|
||||
controlType: string // Button, Edit, Text, List, etc.
|
||||
automationId: string
|
||||
boundingRect: { x: number, y: number, w: number, h: number }
|
||||
isEnabled: boolean
|
||||
value?: string // ValuePattern 可用时
|
||||
children?: UIElement[]
|
||||
}
|
||||
```
|
||||
|
||||
**PowerShell 脚本核心**:
|
||||
```powershell
|
||||
Add-Type -AssemblyName UIAutomationClient
|
||||
Add-Type -AssemblyName UIAutomationTypes
|
||||
|
||||
# 读取 UI 树
|
||||
$root = [AutomationElement]::RootElement
|
||||
$window = $root.FindFirst([TreeScope]::Children,
|
||||
[PropertyCondition]::new([AutomationElement]::NameProperty, $title))
|
||||
$elements = $window.FindAll([TreeScope]::Descendants, [Condition]::TrueCondition)
|
||||
|
||||
# 写入文本
|
||||
$element.GetCurrentPattern([ValuePattern]::Pattern).SetValue($text)
|
||||
|
||||
# 点击按钮
|
||||
$element.GetCurrentPattern([InvokePattern]::Pattern).Invoke()
|
||||
```
|
||||
|
||||
**验证标准**:
|
||||
- 能读取记事本的 UI 树(按钮、文本框、菜单)
|
||||
- 能向文本框写入内容
|
||||
- 能点击按钮
|
||||
- 能识别坐标处的元素
|
||||
|
||||
### Phase C:OCR 屏幕文字识别
|
||||
|
||||
**问题**:截图后 AI 只能看到图片,无法直接读取文字。
|
||||
**方案**:用 `Windows.Media.Ocr` 对截图进行文字识别。
|
||||
|
||||
| 步骤 | 文件 | 改动 |
|
||||
|------|------|------|
|
||||
| C.1 | `src/utils/computerUse/win32/ocr.ts` | 新建:截图 + OCR 识别 |
|
||||
| C.2 | `packages/@ant/computer-use-mcp/src/tools.ts` | 增加 `screen_ocr` 工具定义 |
|
||||
|
||||
**ocr.ts 导出函数**:
|
||||
```typescript
|
||||
// 对屏幕区域 OCR
|
||||
ocrRegion(x: number, y: number, w: number, h: number, lang?: string): OcrResult
|
||||
|
||||
// 对指定窗口 OCR
|
||||
ocrWindow(windowTitle: string, lang?: string): OcrResult
|
||||
|
||||
interface OcrResult {
|
||||
text: string
|
||||
lines: { text: string, bounds: {x,y,w,h} }[]
|
||||
language: string
|
||||
}
|
||||
```
|
||||
|
||||
**已确认可用语言**:英语 (en-US) + 中文 (zh-Hans-CN)
|
||||
|
||||
**验证标准**:
|
||||
- 能识别屏幕区域中的英文和中文
|
||||
- 返回文字内容 + 每行的位置信息
|
||||
|
||||
### Phase D:高频操作性能优化
|
||||
|
||||
**问题**:每次 PowerShell 启动 273ms,鼠标移动等高频操作太慢。
|
||||
**方案**:用 .NET `System.Windows.Forms.Clipboard` 等直接 API 替代 PowerShell 子进程。
|
||||
|
||||
| 步骤 | 文件 | 改动 |
|
||||
|------|------|------|
|
||||
| D.1 | `src/utils/computerUse/executor.ts` | 剪贴板操作用直接 API 替代 PowerShell |
|
||||
| D.2 | 考虑驻留 PowerShell 进程 | 通过 stdin/stdout 交互,摊平启动成本 |
|
||||
|
||||
**剪贴板直接 API**(以下为 .NET 调用,此处用 PowerShell 语法示意;目标是不再为每次剪贴板操作单独启动一个 PowerShell 子进程):
|
||||
```powershell
|
||||
# 读:50ms → <1ms
|
||||
[System.Windows.Forms.Clipboard]::GetText()
|
||||
|
||||
# 写:50ms → <1ms
|
||||
[System.Windows.Forms.Clipboard]::SetText($text)
|
||||
|
||||
# 图片检测
|
||||
[System.Windows.Forms.Clipboard]::ContainsImage()
|
||||
```
|
||||
|
||||
### Phase E:`request_access` Windows 适配
|
||||
|
||||
**问题**:`request_access` 依赖 macOS bundleId 识别应用,Windows 没有这个概念。
|
||||
**方案**:在 Windows 上用 exe 路径 + 窗口标题替代 bundleId。
|
||||
|
||||
| 步骤 | 文件 | 改动 |
|
||||
|------|------|------|
|
||||
| E.1 | `packages/@ant/computer-use-mcp/src/toolCalls.ts` | `resolveRequestedApps` 在 Windows 上用 exe 路径匹配 |
|
||||
| E.2 | `packages/@ant/computer-use-mcp/src/sentinelApps.ts` | 增加 Windows 危险应用列表(cmd.exe, powershell.exe 等) |
|
||||
| E.3 | `packages/@ant/computer-use-mcp/src/deniedApps.ts` | 增加 Windows 浏览器/终端识别规则 |
|
||||
| E.4 | `src/utils/computerUse/hostAdapter.ts` | `ensureOsPermissions` Windows 上检查 UAC 状态 |
|
||||
|
||||
**Windows 应用标识映射**:
|
||||
```
|
||||
macOS bundleId → Windows 等价
|
||||
com.apple.Safari → C:\Program Files\...\msedge.exe(或窗口标题匹配)
|
||||
com.google.Chrome → chrome.exe
|
||||
com.apple.Terminal → WindowsTerminal.exe / cmd.exe
|
||||
```
|
||||
|
||||
### Phase F:全局热键(ESC 拦截)
|
||||
|
||||
**问题**:当前非 darwin 直接跳过 ESC 热键,用 Ctrl+C 替代。
|
||||
**方案**:用 `RegisterHotKey` 或 `SetWindowsHookEx(WH_KEYBOARD_LL)` 实现。
|
||||
|
||||
| 步骤 | 文件 | 改动 |
|
||||
|------|------|------|
|
||||
| F.1 | `src/utils/computerUse/escHotkey.ts` | Windows 分支:RegisterHotKey 注册 ESC |
|
||||
|
||||
**优先级低**——当前 Ctrl+C fallback 可用,ESC 热键是体验优化。
|
||||
|
||||
## 5. 执行优先级
|
||||
|
||||
```
|
||||
Phase A: 窗口绑定截图 ← P0 核心需求,解决"操作其他界面"
|
||||
Phase B: UI Automation ← P0 核心能力,AI 理解 UI 结构
|
||||
Phase C: OCR ← P1 增值能力,AI 读屏幕文字
|
||||
Phase D: 性能优化 ← P1 体验优化,高频操作提速
|
||||
Phase E: request_access 适配 ← P1 功能完整性,权限模型适配
|
||||
Phase F: ESC 热键 ← P2 体验优化,可后做
|
||||
```
|
||||
|
||||
## 6. 每个 Phase 的改动量估算
|
||||
|
||||
| Phase | 新增文件 | 修改文件 | 新增代码行 | 风险 |
|
||||
|-------|---------|---------|-----------|------|
|
||||
| A 窗口截图 | 2 | 1 | ~200 | 低 |
|
||||
| B UI Automation | 1 | 1 | ~300 | 中 |
|
||||
| C OCR | 1 | 1 | ~150 | 低 |
|
||||
| D 性能优化 | 0 | 2 | ~50 | 低 |
|
||||
| E request_access | 0 | 4 | ~100 | 中 |
| F ESC 热键 | 0 | 1 | ~50 | 低 |
| **总计** | **4** | **10** | **~850** | — |
|
||||
|
||||
## 7. 不动的文件
|
||||
|
||||
- `backends/darwin.ts`(两个包都不动)
|
||||
- `backends/linux.ts`(两个包都不动)
|
||||
- `src/utils/computerUse/` 中 macOS 相关代码路径不动
|
||||
- `packages/@ant/computer-use-mcp/src/` 中已复制的参考项目代码不动(只追加 Windows 工具)
|
||||
|
||||
## 8. 与 macOS/Linux 方案的对比
|
||||
|
||||
| 能力 | macOS | Windows (增强后) | Linux |
|
||||
|------|-------|-----------------|-------|
|
||||
| 截图方式 | SCContentFilter (per-app) | **PrintWindow (per-window)** | scrot (全屏/区域) |
|
||||
| UI 结构 | Accessibility API | **UI Automation** | 无 |
|
||||
| OCR | 无内置 | **Windows.Media.Ocr** | 无内置 |
|
||||
| 键鼠 | CGEvent + enigo | SendInput + keybd_event | xdotool |
|
||||
| 窗口管理 | NSWorkspace | **EnumWindows + Win32** | wmctrl |
|
||||
| 剪贴板 | pbcopy/pbpaste | **Clipboard 直接 API** | xclip |
|
||||
| ESC 热键 | CGEventTap | RegisterHotKey | 无 |
|
||||
| 应用标识 | bundleId | exe 路径 + 窗口标题 | /proc + wmctrl |
|
||||
|
||||
**Windows 增强后将在 UI Automation 和 OCR 方面超过 macOS 方案**——这两项 macOS 原始实现也没有(Anthropic 用的是截图 + Claude 视觉理解,没有结构化 UI 数据)。
|
||||
197
docs/features/computer-use.md
Normal file
197
docs/features/computer-use.md
Normal file
@@ -0,0 +1,197 @@
|
||||
# Computer Use — macOS / Windows / Linux 跨平台实施计划
|
||||
|
||||
更新时间:2026-04-03
|
||||
参考项目:`E:\源码\claude-code-source-main\claude-code-source-main`
|
||||
|
||||
## 1. 现状
|
||||
|
||||
参考项目的 Computer Use **仅支持 macOS**——从入口到底层全部写死 darwin。我们的项目目前已经完成了(对应第 4 节的 Phase 1):
|
||||
|
||||
- ✅ `@ant/computer-use-mcp` stub 替换为完整实现(12 文件)
|
||||
- ✅ `@ant/computer-use-input` 拆为 dispatcher + backends(darwin + win32)
|
||||
- ✅ `@ant/computer-use-swift` 拆为 dispatcher + backends(darwin + win32)
|
||||
- ✅ `CHICAGO_MCP` 编译开关已开
|
||||
- ❌ `src/` 层有 9 处 macOS 硬编码阻塞(见第 2 节编号 1–9)
|
||||
|
||||
## 2. 阻塞点全景
|
||||
|
||||
### 2.1 入口层
|
||||
|
||||
| # | 文件:行号 | 阻塞代码 | 影响 |
|
||||
|---|----------|---------|------|
|
||||
| 1 | `src/main.tsx:1605` | `getPlatform() === 'macos'` | 整个 CU 初始化被跳过 |
|
||||
|
||||
### 2.2 加载层
|
||||
|
||||
| # | 文件:行号 | 阻塞代码 | 影响 |
|
||||
|---|----------|---------|------|
|
||||
| 2 | `src/utils/computerUse/swiftLoader.ts:16` | `process.platform !== 'darwin'` → throw | 截图、应用管理全部不可用 |
|
||||
| 3 | `src/utils/computerUse/executor.ts:263` | `process.platform !== 'darwin'` → throw | 整个 executor 工厂函数不可用 |
|
||||
|
||||
### 2.3 macOS 特有依赖
|
||||
|
||||
| # | 文件:行号 | 依赖 | macOS 实现 | 需要替代方案 |
|
||||
|---|----------|------|-----------|------------|
|
||||
| 4 | `executor.ts:70-88` | 剪贴板 | `pbcopy`/`pbpaste` | Win: PowerShell `Get/Set-Clipboard`;Linux: `xclip`/`wl-copy` |
|
||||
| 5 | `drainRunLoop.ts:21` | CFRunLoop pump | `cu._drainMainRunLoop()` | 非 darwin:直接执行 fn(),不需要 pump |
|
||||
| 6 | `escHotkey.ts:28` | ESC 热键 | CGEventTap | 非 darwin:返回 false(已有 Ctrl+C fallback) |
|
||||
| 7 | `hostAdapter.ts:48-54` | 系统权限 | TCC accessibility + screenRecording | Win:直接 granted;Linux:检查 xdotool |
|
||||
| 8 | `common.ts:56` | 平台标识 | `platform: 'darwin'` 硬编码 | 动态获取 |
|
||||
| 9 | `executor.ts:180` | 粘贴快捷键 | `command+v` | Win/Linux:`ctrl+v` |
|
||||
|
||||
### 2.4 缺失的 Linux 后端
|
||||
|
||||
| 包 | macOS | Windows | Linux |
|
||||
|---|-------|---------|-------|
|
||||
| `computer-use-input/backends/` | ✅ darwin.ts | ✅ win32.ts | ❌ 需新建 linux.ts |
|
||||
| `computer-use-swift/backends/` | ✅ darwin.ts | ✅ win32.ts | ❌ 需新建 linux.ts |
|
||||
|
||||
## 3. 每个平台的能力依赖
|
||||
|
||||
### 3.1 computer-use-input(键鼠)
|
||||
|
||||
| 功能 | macOS | Windows | Linux |
|
||||
|------|-------|---------|-------|
|
||||
| 鼠标移动 | CGEvent JXA | SetCursorPos P/Invoke | xdotool mousemove |
|
||||
| 鼠标点击 | CGEvent JXA | SendInput P/Invoke | xdotool click |
|
||||
| 鼠标滚轮 | CGEvent JXA | SendInput MOUSEEVENTF_WHEEL | xdotool scroll |
|
||||
| 键盘按键 | System Events osascript | keybd_event P/Invoke | xdotool key |
|
||||
| 组合键 | System Events osascript | keybd_event 组合 | xdotool key combo |
|
||||
| 文本输入 | System Events keystroke | SendKeys.SendWait | xdotool type |
|
||||
| 前台应用 | System Events osascript | GetForegroundWindow P/Invoke | xdotool getactivewindow + /proc |
|
||||
| 工具依赖 | osascript(内置) | powershell(内置) | xdotool(需安装) |
|
||||
|
||||
### 3.2 computer-use-swift(截图 + 应用管理)
|
||||
|
||||
| 功能 | macOS | Windows | Linux |
|
||||
|------|-------|---------|-------|
|
||||
| 全屏截图 | screencapture | CopyFromScreen | gnome-screenshot / scrot / grim |
|
||||
| 区域截图 | screencapture -R | CopyFromScreen(rect) | gnome-screenshot -a / scrot -a / grim -g |
|
||||
| 显示器列表 | CGGetActiveDisplayList JXA | Screen.AllScreens | xrandr --query |
|
||||
| 运行中应用 | System Events JXA | Get-Process | wmctrl -l / ps |
|
||||
| 打开应用 | osascript activate | Start-Process | xdg-open / gtk-launch |
|
||||
| 隐藏/显示 | System Events visibility | ShowWindow/SetForegroundWindow | xdotool windowminimize / wmctrl -a(注意:`wmctrl -c` 是关闭窗口,不能用于隐藏) |
|
||||
| 工具依赖 | screencapture + osascript | powershell | xdotool + scrot/grim + wmctrl |
|
||||
|
||||
### 3.3 executor 层
|
||||
|
||||
| 功能 | macOS | Windows | Linux |
|
||||
|------|-------|---------|-------|
|
||||
| drainRunLoop | CFRunLoop pump | 不需要 | 不需要 |
|
||||
| ESC 热键 | CGEventTap | 跳过(Ctrl+C fallback) | 跳过(Ctrl+C fallback) |
|
||||
| 剪贴板读 | pbpaste | `powershell Get-Clipboard` | xclip -o / wl-paste |
|
||||
| 剪贴板写 | pbcopy | `powershell Set-Clipboard` | xclip / wl-copy |
|
||||
| 粘贴快捷键 | command+v | ctrl+v | ctrl+v |
|
||||
| 终端检测 | __CFBundleIdentifier | WT_SESSION / TERM_PROGRAM | TERM_PROGRAM |
|
||||
| 系统权限 | TCC check | 直接 granted | 检查 xdotool 安装 |
|
||||
|
||||
## 4. 执行步骤
|
||||
|
||||
### Phase 1:已完成 ✅
|
||||
|
||||
- [x] `@ant/computer-use-mcp` stub → 完整实现
|
||||
- [x] `@ant/computer-use-input` dispatcher + darwin/win32 backends
|
||||
- [x] `@ant/computer-use-swift` dispatcher + darwin/win32 backends
|
||||
- [x] `CHICAGO_MCP` 编译开关
|
||||
|
||||
### Phase 2:移除 macOS 硬编码阻塞(共 13 个步骤,解锁 macOS + Windows)
|
||||
|
||||
**改动原则:macOS 代码路径不变,只在每处 darwin 守卫后加 win32/linux 分支。**
|
||||
|
||||
| 步骤 | 文件 | 改动 |
|
||||
|------|------|------|
|
||||
| 2.1 | `src/main.tsx:1605` | `getPlatform() === 'macos'` → 去掉平台限制,或改为 `!== 'unknown'` |
|
||||
| 2.2 | `src/utils/computerUse/swiftLoader.ts:16-18` | 移除 `process.platform !== 'darwin'` throw。`@ant/computer-use-swift/index.ts` 已有跨平台 dispatch |
|
||||
| 2.3 | `src/utils/computerUse/executor.ts:263-267` | 移除 `process.platform !== 'darwin'` throw。改为检查 input/swift isSupported |
|
||||
| 2.4 | `src/utils/computerUse/executor.ts:70-88` | 剪贴板函数按平台分发:darwin→pbcopy/pbpaste,win32→PowerShell Get/Set-Clipboard,linux→xclip |
|
||||
| 2.5 | `src/utils/computerUse/executor.ts:180` | `typeViaClipboard` 中 `command+v` → 非 darwin 时用 `ctrl+v` |
|
||||
| 2.6 | `src/utils/computerUse/executor.ts:273` | `const cu = requireComputerUseSwift()` → 改为 `new ComputerUseAPI()`(从 package 直接实例化,不走 swiftLoader throw) |
|
||||
| 2.7 | `src/utils/computerUse/drainRunLoop.ts` | 开头加 `if (process.platform !== 'darwin') return fn()` |
|
||||
| 2.8 | `src/utils/computerUse/escHotkey.ts` | `registerEscHotkey` 非 darwin 返回 false(已有 Ctrl+C fallback) |
|
||||
| 2.9 | `src/utils/computerUse/hostAdapter.ts:48-54` | `ensureOsPermissions` 非 darwin 返回 `{ granted: true }` |
|
||||
| 2.10 | `src/utils/computerUse/common.ts:56` | `platform: 'darwin'` → `platform: process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 'linux' : 'darwin'` |
|
||||
| 2.11 | `src/utils/computerUse/common.ts:55` | `screenshotFiltering: 'native'` → 非 darwin 时 `'none'`(Windows/Linux 截图不支持 per-app 过滤) |
|
||||
| 2.12 | `src/utils/computerUse/gates.ts:13` | `enabled: false` → `enabled: true`(无 GrowthBook 时默认可用) |
|
||||
| 2.13 | `src/utils/computerUse/gates.ts:39-43` | `hasRequiredSubscription()` → 直接返回 `true` |
|
||||
|
||||
### Phase 3:新增 Linux 后端
|
||||
|
||||
| 步骤 | 文件 | 内容 |
|
||||
|------|------|------|
|
||||
| 3.1 | `packages/@ant/computer-use-input/src/backends/linux.ts` | xdotool 键鼠(mousemove/click/key/type/getactivewindow) |
|
||||
| 3.2 | `packages/@ant/computer-use-swift/src/backends/linux.ts` | scrot/grim 截图 + xrandr 显示器 + wmctrl 窗口管理 |
|
||||
| 3.3 | `packages/@ant/computer-use-input/src/index.ts` | dispatcher 加 `case 'linux'` |
|
||||
| 3.4 | `packages/@ant/computer-use-swift/src/index.ts` | dispatcher 加 `case 'linux'` |
|
||||
|
||||
### Phase 4:验证
|
||||
|
||||
| 测试项 | macOS | Windows | Linux |
|
||||
|--------|-------|---------|-------|
|
||||
| build 成功 | ✅ | 验证 | 验证 |
|
||||
| MCP 工具列表非空 | 验证 | 验证 | 验证 |
|
||||
| 鼠标移动 | 验证 | ✅ 已通过 | 验证 |
|
||||
| 截图 | 验证 | ✅ 已通过 | 验证 |
|
||||
| 键盘输入 | 验证 | 验证 | 验证 |
|
||||
| 前台窗口 | 验证 | ✅ 已通过 | 验证 |
|
||||
| 剪贴板 | 验证 | 验证 | 验证 |
|
||||
|
||||
## 5. 文件改动总览
|
||||
|
||||
### 不动的文件(14 个)
|
||||
|
||||
`cleanup.ts`、`computerUseLock.ts`、`wrapper.tsx`、`toolRendering.tsx`、`mcpServer.ts`、`setup.ts`、`appNames.ts`、`inputLoader.ts`、`src/services/mcp/client.ts`、`@ant/computer-use-mcp/src/*`(Phase 1 已完成)、`backends/darwin.ts`(两个包都不动)
|
||||
|
||||
### 改 src/ 的文件(8 个)
|
||||
|
||||
| 文件 | 改动量 | 风险 |
|
||||
|------|--------|------|
|
||||
| `main.tsx` | 1 行 | 低 |
|
||||
| `swiftLoader.ts` | 2 行 | 低 |
|
||||
| `executor.ts` | ~40 行(剪贴板分发 + 平台守卫 + paste 快捷键) | **中** |
|
||||
| `drainRunLoop.ts` | 1 行 | 低 |
|
||||
| `escHotkey.ts` | 3 行 | 低 |
|
||||
| `hostAdapter.ts` | 5 行 | 低 |
|
||||
| `common.ts` | 3 行 | 低 |
|
||||
| `gates.ts` | 3 行 | 低 |
|
||||
|
||||
### 新增文件(2 个)
|
||||
|
||||
| 文件 | 行数估算 |
|
||||
|------|---------|
|
||||
| `packages/@ant/computer-use-input/src/backends/linux.ts` | ~150 行 |
|
||||
| `packages/@ant/computer-use-swift/src/backends/linux.ts` | ~200 行 |
|
||||
|
||||
## 6. Linux 依赖工具
|
||||
|
||||
| 工具 | 用途 | 安装命令(Ubuntu) |
|
||||
|------|------|-------------------|
|
||||
| `xdotool` | 键鼠模拟 + 窗口管理 | `sudo apt install xdotool` |
|
||||
| `scrot` 或 `gnome-screenshot` | 截图 | `sudo apt install scrot` |
|
||||
| `xrandr` | 显示器信息 | 通常已预装 |
|
||||
| `xclip` | 剪贴板 | `sudo apt install xclip` |
|
||||
| `wmctrl` | 窗口列表/切换 | `sudo apt install wmctrl` |
|
||||
|
||||
Wayland 环境需要替代工具:`ydotool`(替代 xdotool)、`grim`(替代 scrot)、`wl-clipboard`(替代 xclip)。初期可先只支持 X11,Wayland 标记为 todo。
|
||||
|
||||
## 7. 执行顺序建议
|
||||
|
||||
```
|
||||
Phase 2(解锁 macOS + Windows)
|
||||
├── 2.1-2.3 移除 3 处硬编码 throw/skip
|
||||
├── 2.4-2.5 剪贴板 + 粘贴快捷键平台分发
|
||||
├── 2.6 swiftLoader → 直接实例化
|
||||
├── 2.7-2.9 drainRunLoop / escHotkey / permissions 平台分支
|
||||
├── 2.10-2.11 common.ts 平台标识动态化
|
||||
├── 2.12-2.13 gates.ts 默认值
|
||||
└── 验证 Windows
|
||||
|
||||
Phase 3(Linux 后端)
|
||||
├── 3.1 input/backends/linux.ts
|
||||
├── 3.2 swift/backends/linux.ts
|
||||
├── 3.3-3.4 dispatcher 加 linux case
|
||||
└── 验证 Linux
|
||||
|
||||
Phase 4(集成验证 + PR)
|
||||
```
|
||||
|
||||
每个 Phase 可独立验证、独立提交。Phase 2 完成后 macOS + Windows 可用,Phase 3 完成后三平台全部可用。
|
||||
137
packages/@ant/computer-use-input/src/backends/darwin.ts
Normal file
137
packages/@ant/computer-use-input/src/backends/darwin.ts
Normal file
@@ -0,0 +1,137 @@
|
||||
/**
|
||||
* macOS backend for computer-use-input
|
||||
*
|
||||
* Uses AppleScript (osascript) and JXA (JavaScript for Automation) to control
|
||||
* mouse and keyboard via CoreGraphics events and System Events.
|
||||
*/
|
||||
|
||||
import { $ } from 'bun'
|
||||
import type { FrontmostAppInfo, InputBackend } from '../types.js'
|
||||
|
||||
/**
 * Numeric key codes for named (non-printable) keys, keyed by lower-case key name.
 *
 * `key` and `keys` look names up here and, on a hit, send the number through
 * AppleScript's `key code` command; names that are missing fall back to
 * `keystroke`. Several aliases share a code (`return`/`enter`,
 * `delete`/`backspace`, `escape`/`esc`).
 */
const KEY_MAP: Record<string, number> = {
  return: 36, enter: 36, tab: 48, space: 49, delete: 51, backspace: 51,
  escape: 53, esc: 53,
  left: 123, right: 124, down: 125, up: 126,
  f1: 122, f2: 120, f3: 99, f4: 118, f5: 96, f6: 97,
  f7: 98, f8: 100, f9: 101, f10: 109, f11: 103, f12: 111,
  home: 115, end: 119, pageup: 116, pagedown: 121,
}
|
||||
|
||||
/**
 * Maps lower-case modifier names (and their aliases) to the phrase used inside
 * an AppleScript `using {...}` clause, e.g. `ctrl` -> `control down`.
 *
 * `keys` treats any part found in this table as a modifier and everything else
 * as the key to press.
 */
const MODIFIER_MAP: Record<string, string> = {
  command: 'command down', cmd: 'command down', meta: 'command down', super: 'command down',
  shift: 'shift down',
  option: 'option down', alt: 'option down',
  control: 'control down', ctrl: 'control down',
}
|
||||
|
||||
/**
 * Runs one AppleScript snippet through `osascript -e` and returns its output.
 *
 * The shell call is made with `.quiet().nothrow()`, so a failing script does
 * not reject the promise; callers simply receive whatever stdout was produced
 * (possibly an empty string).
 *
 * @param script AppleScript source, passed as a single `-e` argument.
 * @returns Stdout of the `osascript` process with surrounding whitespace trimmed.
 */
async function osascript(script: string): Promise<string> {
  const result = await $`osascript -e ${script}`.quiet().nothrow().text()
  return result.trim()
}
|
||||
|
||||
/**
 * Runs one JavaScript-for-Automation snippet through
 * `osascript -l JavaScript -e` and returns its output.
 *
 * Like `osascript`, the call uses `.quiet().nothrow()`, so failures surface as
 * empty or partial output rather than as a rejected promise.
 *
 * @param script JXA source, passed as a single `-e` argument.
 * @returns Stdout of the process with surrounding whitespace trimmed.
 */
async function jxa(script: string): Promise<string> {
  const result = await $`osascript -l JavaScript -e ${script}`.quiet().nothrow().text()
  return result.trim()
}
|
||||
|
||||
/**
 * Builds the JXA source that creates and posts one CoreGraphics mouse event.
 *
 * The generated script creates the event with `CGEventCreateMouseEvent` at
 * point (`x`, `y`), optionally stamps the click-state field, and posts it to
 * `kCGHIDEventTap`.
 *
 * @param eventType Name of the CoreGraphics event-type constant, without the
 *   `$.` prefix (for example `kCGEventLeftMouseDown`).
 * @param x Horizontal coordinate interpolated into `CGPointMake`.
 * @param y Vertical coordinate interpolated into `CGPointMake`.
 * @param btn Mouse-button number passed to `CGEventCreateMouseEvent`.
 * @param clickState When provided, written to `kCGMouseEventClickState`
 *   (callers pass 1 for the first click, 2 for the second, ...).
 * @returns A single-line JXA script.
 */
function buildMouseJxa(eventType: string, x: number, y: number, btn: number, clickState?: number): string {
  let script = `ObjC.import("CoreGraphics"); var p = $.CGPointMake(${x},${y}); var e = $.CGEventCreateMouseEvent(null, $.${eventType}, p, ${btn});`
  if (clickState !== undefined) {
    script += ` $.CGEventSetIntegerValueField(e, $.kCGMouseEventClickState, ${clickState});`
  }
  script += ` $.CGEventPost($.kCGHIDEventTap, e);`
  return script
}
|
||||
|
||||
/**
 * Moves the pointer to (`x`, `y`) by posting a `kCGEventMouseMoved` event.
 *
 * The third argument (`_animated`) is accepted for interface compatibility and
 * ignored: the move is always a single event.
 */
export const moveMouse: InputBackend['moveMouse'] = async (x, y, _animated) => {
  await jxa(buildMouseJxa('kCGEventMouseMoved', x, y, 0))
}
|
||||
|
||||
/**
 * Sends a single key through System Events.
 *
 * Nothing is sent for `action === 'release'`; every other action issues exactly
 * one `key code` / `keystroke` command.
 *
 * Names found in `KEY_MAP` (case-insensitive) are sent as `key code N`.
 * Anything else is sent as `keystroke "<text>"`: a single character is sent
 * as given (preserving case), a longer name is lower-cased.
 *
 * The keystroke text is escaped for an AppleScript string literal (`\` and
 * `"`), matching what `typeText` does. Without this, pressing `"` or `\`
 * produced a malformed script.
 */
export const key: InputBackend['key'] = async (keyName, action) => {
  if (action === 'release') return

  const lower = keyName.toLowerCase()
  const keyCode = KEY_MAP[lower]
  if (keyCode !== undefined) {
    await osascript(`tell application "System Events" to key code ${keyCode}`)
    return
  }

  const text = keyName.length === 1 ? keyName : lower
  // Backslashes first, so the backslashes added for quotes are not doubled.
  const escaped = text.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
  await osascript(`tell application "System Events" to keystroke "${escaped}"`)
}
|
||||
|
||||
/**
 * Sends a key combination (for example `['cmd', 'shift', 'z']`) through
 * System Events.
 *
 * Each part found in `MODIFIER_MAP` (case-insensitive) becomes an entry of the
 * AppleScript `using {...}` clause. Any other part is taken as the key to
 * press; if several non-modifier parts are given, the last one wins. When no
 * non-modifier part is present, nothing is sent.
 *
 * The key itself follows the same rules as `key`: names in `KEY_MAP` are sent
 * as `key code N`, everything else as `keystroke "<text>"`. The keystroke text
 * is escaped for an AppleScript string literal (`\` and `"`), matching
 * `typeText`; previously a combination such as shift+`"` produced a malformed
 * script.
 */
export const keys: InputBackend['keys'] = async (parts) => {
  const modifiers: string[] = []
  let finalKey: string | null = null
  for (const part of parts) {
    const mod = MODIFIER_MAP[part.toLowerCase()]
    if (mod) modifiers.push(mod)
    else finalKey = part
  }
  if (!finalKey) return

  const lower = finalKey.toLowerCase()
  const keyCode = KEY_MAP[lower]
  const modStr = modifiers.length > 0 ? ` using {${modifiers.join(', ')}}` : ''
  if (keyCode !== undefined) {
    await osascript(`tell application "System Events" to key code ${keyCode}${modStr}`)
    return
  }

  const text = finalKey.length === 1 ? finalKey : lower
  // Backslashes first, so the backslashes added for quotes are not doubled.
  const escaped = text.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
  await osascript(`tell application "System Events" to keystroke "${escaped}"${modStr}`)
}
|
||||
|
||||
/**
 * Reads the current pointer position via `CGEventGetLocation`.
 *
 * The JXA snippet prints `"<x>,<y>"`; both values are rounded to integers.
 *
 * NOTE(review): `jxa` does not throw on failure, so if the script produces no
 * output the parsed coordinates are `NaN`. Callers currently receive that
 * as-is.
 *
 * @returns The pointer position as `{ x, y }`.
 */
export const mouseLocation: InputBackend['mouseLocation'] = async () => {
  const result = await jxa('ObjC.import("CoreGraphics"); var e = $.CGEventCreate(null); var p = $.CGEventGetLocation(e); p.x + "," + p.y')
  const [xStr, yStr] = result.split(',')
  return { x: Math.round(Number(xStr)), y: Math.round(Number(yStr)) }
}
|
||||
|
||||
/**
 * Presses, releases or clicks a mouse button at the current pointer position.
 *
 * - `'click'`: posts a down/up pair `count` times (default 1). The n-th pair
 *   carries click state n, so `count = 2` is sent as 1 then 2.
 * - `'press'`: posts a single down event.
 * - any other action: posts a single up event.
 *
 * `'left'` maps to button 0, `'right'` to 1 and every other value to 2, with
 * the matching Left/Right/Other event types.
 */
export const mouseButton: InputBackend['mouseButton'] = async (button, action, count) => {
  const pos = await mouseLocation()
  const btn = button === 'left' ? 0 : button === 'right' ? 1 : 2
  const downType = btn === 0 ? 'kCGEventLeftMouseDown' : btn === 1 ? 'kCGEventRightMouseDown' : 'kCGEventOtherMouseDown'
  const upType = btn === 0 ? 'kCGEventLeftMouseUp' : btn === 1 ? 'kCGEventRightMouseUp' : 'kCGEventOtherMouseUp'

  if (action === 'click') {
    for (let i = 0; i < (count ?? 1); i++) {
      await jxa(buildMouseJxa(downType, pos.x, pos.y, btn, i + 1))
      await jxa(buildMouseJxa(upType, pos.x, pos.y, btn, i + 1))
    }
  } else if (action === 'press') {
    await jxa(buildMouseJxa(downType, pos.x, pos.y, btn))
  } else {
    await jxa(buildMouseJxa(upType, pos.x, pos.y, btn))
  }
}
|
||||
|
||||
/**
 * Posts a scroll-wheel event built with `CGEventCreateScrollWheelEvent`.
 *
 * For `direction === 'vertical'` the event has one wheel carrying `amount`.
 * For any other direction it has two wheels: the first is 0 and the second
 * carries `amount`. The units argument is 0 in both cases.
 */
export const mouseScroll: InputBackend['mouseScroll'] = async (amount, direction) => {
  const script = direction === 'vertical'
    ? `ObjC.import("CoreGraphics"); var e = $.CGEventCreateScrollWheelEvent(null, 0, 1, ${amount}); $.CGEventPost($.kCGHIDEventTap, e);`
    : `ObjC.import("CoreGraphics"); var e = $.CGEventCreateScrollWheelEvent(null, 0, 2, 0, ${amount}); $.CGEventPost($.kCGHIDEventTap, e);`
  await jxa(script)
}
|
||||
|
||||
/**
 * Types `text` through a single System Events `keystroke` command.
 *
 * Backslashes and double quotes are escaped so the text can sit inside an
 * AppleScript string literal. Backslashes are handled first so the ones added
 * for quotes are not doubled.
 */
export const typeText: InputBackend['typeText'] = async (text) => {
  const escaped = text.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
  await osascript(`tell application "System Events" to keystroke "${escaped}"`)
}
|
||||
|
||||
/**
 * Synchronously asks System Events for the frontmost application.
 *
 * The AppleScript returns `"<bundleId>|<appName>"`. The output is split at the
 * FIRST `|` only, so an application name that itself contains `|` is kept
 * intact (the previous `split('|', 2)` silently dropped everything after a
 * second `|`).
 *
 * @returns `{ bundleId, appName }`, or `null` when the output contains no `|`
 *   (including empty output) or when spawning `osascript` throws.
 */
export const getFrontmostAppInfo: InputBackend['getFrontmostAppInfo'] = () => {
  try {
    const result = Bun.spawnSync({
      cmd: ['osascript', '-e', `
        tell application "System Events"
          set frontApp to first application process whose frontmost is true
          set appName to name of frontApp
          set bundleId to bundle identifier of frontApp
          return bundleId & "|" & appName
        end tell
      `],
      stdout: 'pipe',
      stderr: 'pipe',
    })
    const output = new TextDecoder().decode(result.stdout).trim()
    const sep = output.indexOf('|')
    if (sep === -1) return null
    return { bundleId: output.slice(0, sep), appName: output.slice(sep + 1) }
  } catch {
    // Best effort: any failure to spawn or decode is reported as "unknown".
    return null
  }
}
|
||||
173
packages/@ant/computer-use-input/src/backends/linux.ts
Normal file
173
packages/@ant/computer-use-input/src/backends/linux.ts
Normal file
@@ -0,0 +1,173 @@
|
||||
/**
|
||||
* Linux backend for computer-use-input
|
||||
*
|
||||
* Uses xdotool for mouse and keyboard simulation.
|
||||
* Requires: xdotool (apt install xdotool)
|
||||
*/
|
||||
|
||||
import type { FrontmostAppInfo, InputBackend } from '../types.js'
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Shell helper — run a command and return trimmed stdout
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Runs `cmd` synchronously and returns its decoded, trimmed stdout.
 *
 * stderr is captured but discarded, and the exit code is not inspected, so a
 * failing command simply yields whatever it printed to stdout (often '').
 */
function run(cmd: string[]): string {
  const { stdout } = Bun.spawnSync({ cmd, stdout: 'pipe', stderr: 'pipe' })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
|
||||
|
||||
/**
 * Asynchronous counterpart of `run`: spawns `cmd`, collects all of stdout,
 * waits for the process to exit and resolves with the trimmed output.
 */
async function runAsync(cmd: string[]): Promise<string> {
  const child = Bun.spawn(cmd, { stdout: 'pipe', stderr: 'pipe' })
  // Drain stdout before awaiting exit, in the same order as before.
  const stdoutText = await new Response(child.stdout).text()
  await child.exited
  return stdoutText.trim()
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// xdotool key name mapping
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Lower-cased key names accepted by callers → xdotool key names. */
const KEY_MAP: Record<string, string> = {
  return: 'Return', enter: 'Return', tab: 'Tab', space: 'space',
  backspace: 'BackSpace', delete: 'Delete', escape: 'Escape', esc: 'Escape',
  left: 'Left', up: 'Up', right: 'Right', down: 'Down',
  home: 'Home', end: 'End', pageup: 'Prior', pagedown: 'Next',
  f1: 'F1', f2: 'F2', f3: 'F3', f4: 'F4', f5: 'F5', f6: 'F6',
  f7: 'F7', f8: 'F8', f9: 'F9', f10: 'F10', f11: 'F11', f12: 'F12',
  shift: 'shift', lshift: 'shift', rshift: 'shift',
  control: 'ctrl', ctrl: 'ctrl', lcontrol: 'ctrl', rcontrol: 'ctrl',
  alt: 'alt', option: 'alt', lalt: 'alt', ralt: 'alt',
  win: 'super', meta: 'super', command: 'super', cmd: 'super', super: 'super',
  insert: 'Insert', printscreen: 'Print', pause: 'Pause',
  numlock: 'Num_Lock', capslock: 'Caps_Lock', scrolllock: 'Scroll_Lock',
}

/** Lower-cased names that `keys()` treats as modifiers rather than the final key. */
const MODIFIER_KEYS = new Set([
  'shift', 'lshift', 'rshift', 'control', 'ctrl', 'lcontrol', 'rcontrol',
  'alt', 'option', 'lalt', 'ralt', 'win', 'meta', 'command', 'cmd', 'super',
])

/**
 * Translates a key name into the name xdotool expects.
 *
 * The lookup is case-insensitive. Names without an entry in `KEY_MAP` are
 * returned unchanged (original casing preserved).
 *
 * @param name Key name as supplied by the caller.
 * @returns The xdotool key name.
 */
function mapKey(name: string): string {
  const lower = name.toLowerCase()
  // Own-property check: a bare `KEY_MAP[lower]` would also find inherited
  // Object.prototype members (e.g. "constructor", "tostring" is safe but
  // "valueof"/"constructor" are not) and return a function instead of a name.
  if (!Object.prototype.hasOwnProperty.call(KEY_MAP, lower)) return name
  return KEY_MAP[lower] ?? name
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// xdotool mouse button mapping
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Converts a logical mouse button into the button number passed to xdotool:
 * left → "1", right → "3", anything else (middle) → "2".
 */
function mouseButtonNum(button: 'left' | 'right' | 'middle'): string {
  switch (button) {
    case 'left':
      return '1'
    case 'right':
      return '3'
    default:
      return '2'
  }
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Implementation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Moves the pointer to (`x`, `y`), rounded to whole pixels, using
 * `xdotool mousemove --sync`. The `_animated` flag is ignored by this backend.
 */
export const moveMouse: InputBackend['moveMouse'] = async (x, y, _animated) => {
  const px = String(Math.round(x))
  const py = String(Math.round(y))
  run(['xdotool', 'mousemove', '--sync', px, py])
}
|
||||
|
||||
/**
 * Reads the current pointer position from `xdotool getmouselocation`.
 *
 * The expected report looks like "x:123 y:456 screen:0 window:12345678".
 * A coordinate that cannot be found in the report is returned as 0.
 */
export const mouseLocation: InputBackend['mouseLocation'] = async () => {
  const report = run(['xdotool', 'getmouselocation'])

  // Extracts the first capture group of `pattern` as a number, or 0.
  const readAxis = (pattern: RegExp): number => {
    const hit = report.match(pattern)
    return hit ? Number(hit[1]) : 0
  }

  return { x: readAxis(/x:(\d+)/), y: readAxis(/y:(\d+)/) }
}
|
||||
|
||||
/**
 * Clicks, presses or releases a mouse button at the current pointer position.
 *
 * - `click`: `xdotool click --repeat <count>` (count defaults to 1)
 * - `press`: `xdotool mousedown`
 * - any other action (`release`): `xdotool mouseup`
 */
export const mouseButton: InputBackend['mouseButton'] = async (button, action, count) => {
  const btn = mouseButtonNum(button)
  switch (action) {
    case 'click':
      run(['xdotool', 'click', '--repeat', String(count ?? 1), btn])
      return
    case 'press':
      run(['xdotool', 'mousedown', btn])
      return
    default:
      run(['xdotool', 'mouseup', btn])
  }
}
|
||||
|
||||
/**
 * Scrolls by emitting xdotool wheel-button clicks.
 *
 * Wheel buttons: 4 = up, 5 = down, 6 = left, 7 = right. A non-negative
 * `amount` scrolls down/right, a negative one up/left. The magnitude, rounded
 * to a whole number, is the number of clicks; nothing is sent when it is 0.
 */
export const mouseScroll: InputBackend['mouseScroll'] = async (amount, direction) => {
  const forward = amount >= 0
  const wheelButton = direction === 'vertical'
    ? (forward ? '5' : '4')
    : (forward ? '7' : '6')

  const clicks = Math.abs(Math.round(amount))
  if (clicks > 0) {
    run(['xdotool', 'click', '--repeat', String(clicks), wheelButton])
  }
}
|
||||
|
||||
/**
 * Presses (`xdotool keydown`) or releases (`xdotool keyup`) a single key.
 * The key name is translated with `mapKey` first.
 */
export const key: InputBackend['key'] = async (keyName, action) => {
  const verb = action === 'press' ? 'keydown' : 'keyup'
  run(['xdotool', verb, mapKey(keyName)])
}
|
||||
|
||||
/**
 * Sends a key chord such as ['ctrl', 'shift', 't'] as one `xdotool key`
 * invocation in "modifier+modifier+key" form.
 *
 * Every part listed in `MODIFIER_KEYS` is a modifier; the last remaining part
 * is the key to strike. Nothing is sent when there is no such key.
 */
export const keys: InputBackend['keys'] = async (parts) => {
  const isModifier = (part: string): boolean => MODIFIER_KEYS.has(part.toLowerCase())

  const modifierNames = parts.filter(isModifier).map((part) => mapKey(part))
  const plainKeys = parts.filter((part) => !isModifier(part))

  // When several non-modifier parts are given, the last one wins.
  const finalKey = plainKeys.length > 0 ? plainKeys[plainKeys.length - 1] : null
  if (!finalKey) return

  const chord = modifierNames.concat(mapKey(finalKey)).join('+')
  run(['xdotool', 'key', chord])
}
|
||||
|
||||
/**
 * Types `text` into the focused window with `xdotool type`, using a 12 ms
 * delay between keystrokes.
 *
 * @param text Literal text to type.
 */
export const typeText: InputBackend['typeText'] = async (text) => {
  // `--` ends option parsing, so text that begins with '-' (for example
  // "--help" or "-1") is typed literally instead of being read as an option.
  run(['xdotool', 'type', '--delay', '12', '--', text])
}
|
||||
|
||||
/**
 * Describes the process that owns the active X11 window.
 *
 * The window id comes from `xdotool getactivewindow` and its pid from
 * `xdotool getwindowpid`. `bundleId` is the resolved `/proc/<pid>/exe` path
 * (or that literal path when it cannot be resolved) and `appName` is the
 * content of `/proc/<pid>/comm` (or 'unknown').
 *
 * Returns `null` when there is no active window, no pid, or neither the
 * executable path nor the process name could be read.
 */
export const getFrontmostAppInfo: InputBackend['getFrontmostAppInfo'] = () => {
  // Best-effort lookup of one value: a spawn failure counts as "not available".
  const tryRun = (cmd: string[]): string => {
    try {
      return run(cmd)
    } catch {
      return ''
    }
  }

  try {
    const windowId = run(['xdotool', 'getactivewindow'])
    if (!windowId) return null

    const pidStr = run(['xdotool', 'getwindowpid', windowId])
    if (!pidStr) return null
    const pid = pidStr.trim()

    const procExe = `/proc/${pid}/exe`
    const exePath = tryRun(['readlink', '-f', procExe])
    const appName = tryRun(['cat', `/proc/${pid}/comm`])

    if (!exePath && !appName) return null
    return { bundleId: exePath || procExe, appName: appName || 'unknown' }
  } catch {
    return null
  }
}
|
||||
218
packages/@ant/computer-use-input/src/backends/win32.ts
Normal file
218
packages/@ant/computer-use-input/src/backends/win32.ts
Normal file
@@ -0,0 +1,218 @@
|
||||
/**
|
||||
* Windows backend for computer-use-input
|
||||
*
|
||||
* Uses PowerShell with Win32 P/Invoke (SetCursorPos, SendInput, keybd_event,
|
||||
* GetForegroundWindow) to control mouse and keyboard.
|
||||
*
|
||||
 * The P/Invoke type source is defined once at module load and prepended to each PowerShell script that needs it.
|
||||
*/
|
||||
|
||||
import type { FrontmostAppInfo, InputBackend } from '../types.js'
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// PowerShell helper — run a script and return trimmed stdout
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Runs `script` in a new, non-interactive PowerShell process (no profile) and
 * returns its decoded, trimmed stdout.
 *
 * stderr is captured but discarded and the exit code is not inspected.
 */
function ps(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({ cmd: argv, stdout: 'pipe', stderr: 'pipe' })
  return new TextDecoder().decode(stdout).trim()
}
|
||||
|
||||
/**
 * Asynchronous counterpart of `ps`: spawns PowerShell with `script`, collects
 * all of stdout, waits for the process to exit and resolves with the trimmed
 * output.
 */
async function psAsync(script: string): Promise<string> {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const child = Bun.spawn(argv, { stdout: 'pipe', stderr: 'pipe' })
  // Drain stdout before awaiting exit, in the same order as before.
  const stdoutText = await new Response(child.stdout).text()
  await child.exited
  return stdoutText.trim()
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// P/Invoke type definitions (prepended to each script; every ps() call starts a new PowerShell process, so Add-Type runs per call)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * PowerShell snippet that defines the `CuWin32` class (user32.dll P/Invoke
 * signatures, input structs and flag constants) via `Add-Type`.
 *
 * It is prepended to the scripts that call `[CuWin32]::...`. The here-string
 * delimiters must stay exactly as they are: `@'` ends its line and `'@` starts
 * its own line.
 *
 * The INPUT structs use sequential layout so the marshaller places the payload
 * at its natural alignment: offset 4 in a 32-bit process and offset 8 in a
 * 64-bit one. A fixed `FieldOffset(4)` only matches the 32-bit native layout.
 */
const WIN32_TYPES = `
Add-Type -Language CSharp @'
using System;
using System.Runtime.InteropServices;
using System.Text;
using System.Diagnostics;

public class CuWin32 {
    // --- Cursor ---
    [DllImport("user32.dll")] public static extern bool SetCursorPos(int X, int Y);
    [DllImport("user32.dll")] public static extern bool GetCursorPos(out POINT p);
    [StructLayout(LayoutKind.Sequential)] public struct POINT { public int X; public int Y; }

    // --- SendInput ---
    [StructLayout(LayoutKind.Sequential)] public struct MOUSEINPUT {
        public int dx; public int dy; public int mouseData; public uint dwFlags; public uint time; public IntPtr dwExtraInfo;
    }
    // MOUSEINPUT is the largest member of the native INPUT union, so
    // "type + MOUSEINPUT" laid out sequentially has the native size and offsets.
    [StructLayout(LayoutKind.Sequential)] public struct INPUT {
        public uint type;
        public MOUSEINPUT mi;
    }
    [StructLayout(LayoutKind.Sequential)] public struct KEYBDINPUT {
        public ushort wVk; public ushort wScan; public uint dwFlags; public uint time; public IntPtr dwExtraInfo;
    }
    // KEYBDINPUT is 8 bytes smaller than MOUSEINPUT; the padding brings KINPUT
    // up to the native INPUT size that SendInput validates through cbSize.
    [StructLayout(LayoutKind.Sequential)] public struct KINPUT {
        public uint type;
        public KEYBDINPUT ki;
        public uint pad0; public uint pad1;
    }
    [DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, INPUT[] i, int cb);
    [DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, KINPUT[] i, int cb);

    // --- Keyboard ---
    [DllImport("user32.dll")] public static extern void keybd_event(byte bVk, byte bScan, uint dwFlags, UIntPtr dwExtraInfo);
    [DllImport("user32.dll")] public static extern short VkKeyScan(char ch);

    // --- Window ---
    [DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow();
    [DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint pid);
    [DllImport("user32.dll", CharSet=CharSet.Unicode)] public static extern int GetWindowText(IntPtr hWnd, StringBuilder sb, int max);

    // Constants
    public const uint INPUT_MOUSE = 0, INPUT_KEYBOARD = 1;
    public const uint MOUSEEVENTF_LEFTDOWN = 0x0002, MOUSEEVENTF_LEFTUP = 0x0004;
    public const uint MOUSEEVENTF_RIGHTDOWN = 0x0008, MOUSEEVENTF_RIGHTUP = 0x0010;
    public const uint MOUSEEVENTF_MIDDLEDOWN = 0x0020, MOUSEEVENTF_MIDDLEUP = 0x0040;
    public const uint MOUSEEVENTF_WHEEL = 0x0800, MOUSEEVENTF_HWHEEL = 0x1000;
    public const uint KEYEVENTF_KEYUP = 0x0002;
}
'@
`
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Virtual key code mapping
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Lower-cased key names accepted by callers → Windows virtual-key codes. */
const VK_MAP: Record<string, number> = {
  // Editing and whitespace
  return: 0x0D, enter: 0x0D,
  tab: 0x09, space: 0x20,
  backspace: 0x08, delete: 0x2E, insert: 0x2D,
  escape: 0x1B, esc: 0x1B,

  // Navigation
  left: 0x25, up: 0x26, right: 0x27, down: 0x28,
  home: 0x24, end: 0x23,
  pageup: 0x21, pagedown: 0x22,

  // Function keys
  f1: 0x70, f2: 0x71, f3: 0x72, f4: 0x73,
  f5: 0x74, f6: 0x75, f7: 0x76, f8: 0x77,
  f9: 0x78, f10: 0x79, f11: 0x7A, f12: 0x7B,

  // Modifiers — the unqualified names map to the left-hand key
  shift: 0xA0, lshift: 0xA0, rshift: 0xA1,
  control: 0xA2, ctrl: 0xA2, lcontrol: 0xA2, rcontrol: 0xA3,
  alt: 0xA4, option: 0xA4, lalt: 0xA4, ralt: 0xA5,
  win: 0x5B, meta: 0x5B, command: 0x5B, cmd: 0x5B, super: 0x5B,

  // System and lock keys
  printscreen: 0x2C, pause: 0x13,
  numlock: 0x90, capslock: 0x14, scrolllock: 0x91,
}

/** Lower-cased names that `keys()` treats as modifiers rather than the final key. */
const MODIFIER_KEYS = new Set([
  'shift', 'lshift', 'rshift',
  'control', 'ctrl', 'lcontrol', 'rcontrol',
  'alt', 'option', 'lalt', 'ralt',
  'win', 'meta', 'command', 'cmd', 'super',
])
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Implementation
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Moves the cursor to (`x`, `y`), rounded to whole pixels, through
 * `SetCursorPos`. The `_animated` flag is ignored by this backend.
 */
export const moveMouse: InputBackend['moveMouse'] = async (x, y, _animated) => {
  const px = Math.round(x)
  const py = Math.round(y)
  ps(`${WIN32_TYPES}; [CuWin32]::SetCursorPos(${px}, ${py}) | Out-Null`)
}
|
||||
|
||||
/**
 * Reads the current cursor position through `GetCursorPos`.
 *
 * The PowerShell script prints "X,Y". When that output is missing or not
 * numeric (for example because PowerShell failed to start the script), the
 * position is reported as 0,0 instead of leaking `NaN` to callers. This
 * matches the Linux backend, which also falls back to 0.
 *
 * @returns The cursor position in screen coordinates.
 */
export const mouseLocation: InputBackend['mouseLocation'] = async () => {
  const out = ps(`${WIN32_TYPES}; $p = New-Object CuWin32+POINT; [CuWin32]::GetCursorPos([ref]$p) | Out-Null; "$($p.X),$($p.Y)"`)
  const [xStr, yStr] = out.split(',')
  // An absent or empty field must not be coerced to 0 or NaN by Number().
  const x = xStr ? Number(xStr) : Number.NaN
  const y = yStr ? Number(yStr) : Number.NaN
  if (!Number.isFinite(x) || !Number.isFinite(y)) return { x: 0, y: 0 }
  return { x, y }
}
|
||||
|
||||
/**
 * Clicks, presses or releases a mouse button at the current cursor position
 * by sending mouse INPUT records through `SendInput`.
 *
 * `click` sends `count` (default 1) down/up pairs within one PowerShell run;
 * `press` sends only the down event; any other action sends only the up event.
 */
export const mouseButton: InputBackend['mouseButton'] = async (button, action, count) => {
  const which = button === 'left' ? 'LEFT' : button === 'right' ? 'RIGHT' : 'MIDDLE'
  const downFlag = `MOUSEEVENTF_${which}DOWN`
  const upFlag = `MOUSEEVENTF_${which}UP`

  // Script fragment that sets the event flag on $i and submits it.
  const send = (flag: string): string =>
    `$i.mi.dwFlags=[CuWin32]::${flag}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`
  const preamble = `${WIN32_TYPES}; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; `

  if (action === 'click') {
    const total = count ?? 1
    const pairs: string[] = []
    for (let done = 0; done < total; done++) {
      pairs.push(`${send(downFlag)}; ${send(upFlag)}; `)
    }
    ps(preamble + pairs.join(''))
    return
  }

  ps(preamble + send(action === 'press' ? downFlag : upFlag))
}
|
||||
|
||||
/**
 * Scrolls the wheel by sending one mouse INPUT record through `SendInput`.
 *
 * Vertical scrolling uses `MOUSEEVENTF_WHEEL`, anything else
 * `MOUSEEVENTF_HWHEEL`. `amount` is multiplied by 120 to form `mouseData`.
 */
export const mouseScroll: InputBackend['mouseScroll'] = async (amount, direction) => {
  const wheelFlag = direction === 'vertical' ? 'MOUSEEVENTF_WHEEL' : 'MOUSEEVENTF_HWHEEL'
  const wheelDelta = amount * 120
  ps(`${WIN32_TYPES}; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; $i.mi.dwFlags=[CuWin32]::${wheelFlag}; $i.mi.mouseData=${wheelDelta}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`)
}
|
||||
|
||||
/**
 * Presses or releases a single key through `keybd_event`.
 *
 * Named keys are looked up in `VK_MAP` (case-insensitive). A single character
 * without an entry is resolved at run time with `VkKeyScan`. Any other name is
 * silently ignored.
 */
export const key: InputBackend['key'] = async (keyName, action) => {
  // dwFlags for keybd_event: 2 marks a key-up event, 0 a key-down event.
  const flags = action === 'release' ? '2' : '0'
  const vk = VK_MAP[keyName.toLowerCase()]

  if (vk !== undefined) {
    ps(`${WIN32_TYPES}; [CuWin32]::keybd_event(${vk}, 0, ${flags}, [UIntPtr]::Zero)`)
    return
  }
  if (keyName.length !== 1) return

  // Single character: only the low byte of the VkKeyScan result is kept.
  const charCode = keyName.charCodeAt(0)
  ps(`${WIN32_TYPES}; $vk = [CuWin32]::VkKeyScan([char]${charCode}) -band 0xFF; [CuWin32]::keybd_event([byte]$vk, 0, ${flags}, [UIntPtr]::Zero)`)
}
|
||||
|
||||
/**
 * Sends a key chord such as ['ctrl', 'shift', 't'] in one PowerShell run:
 * modifiers down in the given order, the final key down and up, then the
 * modifiers up in reverse order.
 *
 * Every part listed in `MODIFIER_KEYS` is a modifier; the last remaining part
 * is the key to strike. Nothing is sent when there is no such key. A final
 * key that is neither in `VK_MAP` nor a single character is skipped, but the
 * modifiers are still pressed and released.
 */
export const keys: InputBackend['keys'] = async (parts) => {
  const modifierCodes: number[] = []
  let finalKey: string | null = null

  for (const part of parts) {
    const name = part.toLowerCase()
    if (!MODIFIER_KEYS.has(name)) {
      finalKey = part
      continue
    }
    const code = VK_MAP[name]
    if (code !== undefined) modifierCodes.push(code)
  }
  if (!finalKey) return

  // keybd_event call text; dwFlags 0 = key down, 2 = key up.
  const event = (vkExpr: string | number, dwFlags: 0 | 2): string =>
    `[CuWin32]::keybd_event(${vkExpr}, 0, ${dwFlags}, [UIntPtr]::Zero); `

  const pieces: string[] = [WIN32_TYPES + '; ']
  for (const code of modifierCodes) pieces.push(event(code, 0))

  const finalCode = VK_MAP[finalKey.toLowerCase()]
  if (finalCode !== undefined) {
    pieces.push(event(finalCode, 0) + event(finalCode, 2))
  } else if (finalKey.length === 1) {
    const charCode = finalKey.charCodeAt(0)
    pieces.push(
      `$vk = [CuWin32]::VkKeyScan([char]${charCode}) -band 0xFF; ` +
        event('[byte]$vk', 0) +
        event('[byte]$vk', 2),
    )
  }

  for (const code of [...modifierCodes].reverse()) pieces.push(event(code, 2))

  ps(pieces.join(''))
}
|
||||
|
||||
/**
 * Types `text` into the focused window with
 * `System.Windows.Forms.SendKeys.SendWait`.
 *
 * Two layers of escaping are applied so the text is typed literally:
 * 1. SendKeys gives `+ ^ % ~ ( ) { } [ ]` special meaning (modifiers, Enter,
 *    grouping, key names); each is wrapped in braces, e.g. `%` → `{%}`.
 * 2. The result is embedded in a PowerShell single-quoted string, where a
 *    quote is escaped by doubling it. PowerShell also accepts the typographic
 *    single quotes U+2018, U+2019, U+201A and U+201B as string delimiters, so
 *    those are doubled as well.
 *
 * @param text Literal text to type.
 */
export const typeText: InputBackend['typeText'] = async (text) => {
  const sendKeysLiteral = text.replace(/[+^%~(){}[\]]/g, '{$&}')
  const escaped = sendKeysLiteral.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  ps(`Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait('${escaped}')`)
}
|
||||
|
||||
/**
 * Describes the process that owns the foreground window.
 *
 * `bundleId` is the executable path of the process main module and `appName`
 * its process name. The PowerShell script prints them as "exePath|appName".
 *
 * Returns `null` when PowerShell produced no usable output, including the
 * case where the process lookup failed and both fields came back empty.
 */
export const getFrontmostAppInfo: InputBackend['getFrontmostAppInfo'] = () => {
  try {
    const out = ps(`${WIN32_TYPES}
$hwnd = [CuWin32]::GetForegroundWindow()
$procId = [uint32]0
[CuWin32]::GetWindowThreadProcessId($hwnd, [ref]$procId) | Out-Null
$proc = Get-Process -Id $procId -ErrorAction SilentlyContinue
"$($proc.MainModule.FileName)|$($proc.ProcessName)"`)

    const separator = out.indexOf('|')
    if (separator === -1) return null

    const exePath = out.slice(0, separator)
    const appName = out.slice(separator + 1)
    // With $proc unset the script prints a bare "|": no process was found.
    if (!exePath && !appName) return null

    return { bundleId: exePath, appName }
  } catch {
    // Best effort: a failure to spawn PowerShell is reported as "unknown".
    return null
  }
}
|
||||
@@ -1,174 +1,73 @@
|
||||
/**
|
||||
* @ant/computer-use-input — macOS 键鼠模拟实现
|
||||
* @ant/computer-use-input — cross-platform keyboard & mouse simulation
|
||||
*
|
||||
* 使用 macOS 原生工具实现:
|
||||
* - AppleScript (osascript) — 应用信息、键盘输入
|
||||
* - CGEvent via AppleScript-ObjC bridge — 鼠标操作、位置查询
|
||||
* Platform backends:
|
||||
* - darwin: AppleScript/JXA via CoreGraphics events
|
||||
 * - win32: PowerShell via Win32 P/Invoke (SetCursorPos, SendInput, keybd_event)
 * - linux: xdotool
|
||||
*
|
||||
* 仅 macOS 支持。其他平台返回 { isSupported: false }
|
||||
* Add new platforms by creating backends/<platform>.ts implementing InputBackend.
|
||||
*/
|
||||
|
||||
import { $ } from 'bun'
|
||||
import type { FrontmostAppInfo, InputBackend } from './types.js'
|
||||
|
||||
interface FrontmostAppInfo {
|
||||
bundleId: string
|
||||
appName: string
|
||||
}
|
||||
export type { FrontmostAppInfo, InputBackend } from './types.js'
|
||||
|
||||
// AppleScript key code mapping
|
||||
const KEY_MAP: Record<string, number> = {
|
||||
return: 36, enter: 36, tab: 48, space: 49, delete: 51, backspace: 51,
|
||||
escape: 53, esc: 53,
|
||||
left: 123, right: 124, down: 125, up: 126,
|
||||
f1: 122, f2: 120, f3: 99, f4: 118, f5: 96, f6: 97,
|
||||
f7: 98, f8: 100, f9: 101, f10: 109, f11: 103, f12: 111,
|
||||
home: 115, end: 119, pageup: 116, pagedown: 121,
|
||||
}
|
||||
// ---------------------------------------------------------------------------
|
||||
// Platform dispatch
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const MODIFIER_MAP: Record<string, string> = {
|
||||
command: 'command down', cmd: 'command down', meta: 'command down', super: 'command down',
|
||||
shift: 'shift down',
|
||||
option: 'option down', alt: 'option down',
|
||||
control: 'control down', ctrl: 'control down',
|
||||
}
|
||||
|
||||
async function osascript(script: string): Promise<string> {
|
||||
const result = await $`osascript -e ${script}`.quiet().nothrow().text()
|
||||
return result.trim()
|
||||
}
|
||||
|
||||
async function jxa(script: string): Promise<string> {
|
||||
const result = await $`osascript -l JavaScript -e ${script}`.quiet().nothrow().text()
|
||||
return result.trim()
|
||||
}
|
||||
|
||||
function jxaSync(script: string): string {
|
||||
const result = Bun.spawnSync({
|
||||
cmd: ['osascript', '-l', 'JavaScript', '-e', script],
|
||||
stdout: 'pipe', stderr: 'pipe',
|
||||
})
|
||||
return new TextDecoder().decode(result.stdout).trim()
|
||||
}
|
||||
|
||||
function buildMouseJxa(eventType: string, x: number, y: number, btn: number, clickState?: number): string {
|
||||
let script = `ObjC.import("CoreGraphics"); var p = $.CGPointMake(${x},${y}); var e = $.CGEventCreateMouseEvent(null, $.${eventType}, p, ${btn});`
|
||||
if (clickState !== undefined) {
|
||||
script += ` $.CGEventSetIntegerValueField(e, $.kCGMouseEventClickState, ${clickState});`
|
||||
}
|
||||
script += ` $.CGEventPost($.kCGHIDEventTap, e);`
|
||||
return script
|
||||
}
|
||||
|
||||
// ---- Implementation functions ----
|
||||
|
||||
async function moveMouse(x: number, y: number, _animated: boolean): Promise<void> {
|
||||
await jxa(buildMouseJxa('kCGEventMouseMoved', x, y, 0))
|
||||
}
|
||||
|
||||
async function key(keyName: string, action: 'press' | 'release'): Promise<void> {
|
||||
if (action === 'release') return
|
||||
const lower = keyName.toLowerCase()
|
||||
const keyCode = KEY_MAP[lower]
|
||||
if (keyCode !== undefined) {
|
||||
await osascript(`tell application "System Events" to key code ${keyCode}`)
|
||||
} else {
|
||||
await osascript(`tell application "System Events" to keystroke "${keyName.length === 1 ? keyName : lower}"`)
|
||||
}
|
||||
}
|
||||
|
||||
async function keys(parts: string[]): Promise<void> {
|
||||
const modifiers: string[] = []
|
||||
let finalKey: string | null = null
|
||||
for (const part of parts) {
|
||||
const mod = MODIFIER_MAP[part.toLowerCase()]
|
||||
if (mod) modifiers.push(mod)
|
||||
else finalKey = part
|
||||
}
|
||||
if (!finalKey) return
|
||||
const lower = finalKey.toLowerCase()
|
||||
const keyCode = KEY_MAP[lower]
|
||||
const modStr = modifiers.length > 0 ? ` using {${modifiers.join(', ')}}` : ''
|
||||
if (keyCode !== undefined) {
|
||||
await osascript(`tell application "System Events" to key code ${keyCode}${modStr}`)
|
||||
} else {
|
||||
await osascript(`tell application "System Events" to keystroke "${finalKey.length === 1 ? finalKey : lower}"${modStr}`)
|
||||
}
|
||||
}
|
||||
|
||||
async function mouseLocation(): Promise<{ x: number; y: number }> {
|
||||
const result = await jxa('ObjC.import("CoreGraphics"); var e = $.CGEventCreate(null); var p = $.CGEventGetLocation(e); p.x + "," + p.y')
|
||||
const [xStr, yStr] = result.split(',')
|
||||
return { x: Math.round(Number(xStr)), y: Math.round(Number(yStr)) }
|
||||
}
|
||||
|
||||
async function mouseButton(
|
||||
button: 'left' | 'right' | 'middle',
|
||||
action: 'click' | 'press' | 'release',
|
||||
count?: number,
|
||||
): Promise<void> {
|
||||
const pos = await mouseLocation()
|
||||
const btn = button === 'left' ? 0 : button === 'right' ? 1 : 2
|
||||
const downType = btn === 0 ? 'kCGEventLeftMouseDown' : btn === 1 ? 'kCGEventRightMouseDown' : 'kCGEventOtherMouseDown'
|
||||
const upType = btn === 0 ? 'kCGEventLeftMouseUp' : btn === 1 ? 'kCGEventRightMouseUp' : 'kCGEventOtherMouseUp'
|
||||
|
||||
if (action === 'click') {
|
||||
for (let i = 0; i < (count ?? 1); i++) {
|
||||
await jxa(buildMouseJxa(downType, pos.x, pos.y, btn, i + 1))
|
||||
await jxa(buildMouseJxa(upType, pos.x, pos.y, btn, i + 1))
|
||||
}
|
||||
} else if (action === 'press') {
|
||||
await jxa(buildMouseJxa(downType, pos.x, pos.y, btn))
|
||||
} else {
|
||||
await jxa(buildMouseJxa(upType, pos.x, pos.y, btn))
|
||||
}
|
||||
}
|
||||
|
||||
async function mouseScroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void> {
|
||||
const script = direction === 'vertical'
|
||||
? `ObjC.import("CoreGraphics"); var e = $.CGEventCreateScrollWheelEvent(null, 0, 1, ${amount}); $.CGEventPost($.kCGHIDEventTap, e);`
|
||||
: `ObjC.import("CoreGraphics"); var e = $.CGEventCreateScrollWheelEvent(null, 0, 2, 0, ${amount}); $.CGEventPost($.kCGHIDEventTap, e);`
|
||||
await jxa(script)
|
||||
}
|
||||
|
||||
async function typeText(text: string): Promise<void> {
|
||||
const escaped = text.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
|
||||
await osascript(`tell application "System Events" to keystroke "${escaped}"`)
|
||||
}
|
||||
|
||||
function getFrontmostAppInfo(): FrontmostAppInfo | null {
|
||||
function loadBackend(): InputBackend | null {
|
||||
try {
|
||||
const result = Bun.spawnSync({
|
||||
cmd: ['osascript', '-e', `
|
||||
tell application "System Events"
|
||||
set frontApp to first application process whose frontmost is true
|
||||
set appName to name of frontApp
|
||||
set bundleId to bundle identifier of frontApp
|
||||
return bundleId & "|" & appName
|
||||
end tell
|
||||
`],
|
||||
stdout: 'pipe',
|
||||
stderr: 'pipe',
|
||||
})
|
||||
const output = new TextDecoder().decode(result.stdout).trim()
|
||||
if (!output || !output.includes('|')) return null
|
||||
const [bundleId, appName] = output.split('|', 2)
|
||||
return { bundleId: bundleId!, appName: appName! }
|
||||
switch (process.platform) {
|
||||
case 'darwin':
|
||||
return require('./backends/darwin.js') as InputBackend
|
||||
case 'win32':
|
||||
return require('./backends/win32.js') as InputBackend
|
||||
case 'linux':
|
||||
return require('./backends/linux.js') as InputBackend
|
||||
default:
|
||||
return null
|
||||
}
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
// ---- Exports ----
|
||||
const backend = loadBackend()
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Unsupported stub (throws on call — guards via isSupported check)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Placeholder exported in place of each input function when no backend is
 * available for the current platform; always throws.
 */
function unsupported(): never {
  const message = `computer-use-input is not supported on ${process.platform}`
  throw new Error(message)
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public API — matches the original export surface
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const isSupported = backend !== null
|
||||
|
||||
export const moveMouse = backend?.moveMouse ?? unsupported
|
||||
export const key = backend?.key ?? unsupported
|
||||
export const keys = backend?.keys ?? unsupported
|
||||
export const mouseLocation = backend?.mouseLocation ?? unsupported
|
||||
export const mouseButton = backend?.mouseButton ?? unsupported
|
||||
export const mouseScroll = backend?.mouseScroll ?? unsupported
|
||||
export const typeText = backend?.typeText ?? unsupported
|
||||
export const getFrontmostAppInfo = backend?.getFrontmostAppInfo ?? (() => null)
|
||||
|
||||
// Legacy class type — used by inputLoader.ts for type narrowing
|
||||
export class ComputerUseInputAPI {
|
||||
declare moveMouse: (x: number, y: number, animated: boolean) => Promise<void>
|
||||
declare key: (key: string, action: 'press' | 'release') => Promise<void>
|
||||
declare keys: (parts: string[]) => Promise<void>
|
||||
declare mouseLocation: () => Promise<{ x: number; y: number }>
|
||||
declare mouseButton: (button: 'left' | 'right' | 'middle', action: 'click' | 'press' | 'release', count?: number) => Promise<void>
|
||||
declare mouseScroll: (amount: number, direction: 'vertical' | 'horizontal') => Promise<void>
|
||||
declare typeText: (text: string) => Promise<void>
|
||||
declare getFrontmostAppInfo: () => FrontmostAppInfo | null
|
||||
declare moveMouse: InputBackend['moveMouse']
|
||||
declare key: InputBackend['key']
|
||||
declare keys: InputBackend['keys']
|
||||
declare mouseLocation: InputBackend['mouseLocation']
|
||||
declare mouseButton: InputBackend['mouseButton']
|
||||
declare mouseScroll: InputBackend['mouseScroll']
|
||||
declare typeText: InputBackend['typeText']
|
||||
declare getFrontmostAppInfo: InputBackend['getFrontmostAppInfo']
|
||||
declare isSupported: true
|
||||
}
|
||||
|
||||
@@ -177,7 +76,3 @@ interface ComputerUseInputUnsupported {
|
||||
}
|
||||
|
||||
export type ComputerUseInput = ComputerUseInputAPI | ComputerUseInputUnsupported
|
||||
|
||||
// Plain object with all methods as own properties — compatible with require()
|
||||
export const isSupported = process.platform === 'darwin'
|
||||
export { moveMouse, key, keys, mouseLocation, mouseButton, mouseScroll, typeText, getFrontmostAppInfo }
|
||||
|
||||
19
packages/@ant/computer-use-input/src/types.ts
Normal file
19
packages/@ant/computer-use-input/src/types.ts
Normal file
@@ -0,0 +1,19 @@
|
||||
/** Identity of the application that currently has input focus. */
export interface FrontmostAppInfo {
  /**
   * Platform-specific application identifier: the bundle identifier on macOS,
   * the executable path on Windows and Linux.
   */
  bundleId: string
  /** Application or process name. */
  appName: string
}
|
||||
|
||||
/**
 * Contract implemented by each platform backend (`backends/<platform>.ts`).
 * Every operation is asynchronous except `getFrontmostAppInfo`.
 */
export interface InputBackend {
  // --- Mouse ---

  /** Moves the pointer to the given screen coordinates. */
  moveMouse(x: number, y: number, animated: boolean): Promise<void>

  /** Resolves with the current pointer position. */
  mouseLocation(): Promise<{ x: number; y: number }>

  /**
   * Clicks, presses or releases `button` at the current pointer position.
   * `count` is the number of clicks for the `click` action.
   */
  mouseButton(
    button: 'left' | 'right' | 'middle',
    action: 'click' | 'press' | 'release',
    count?: number,
  ): Promise<void>

  /** Scrolls by `amount` along the given axis. */
  mouseScroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>

  // --- Keyboard ---

  /** Presses or releases a single key identified by name. */
  key(key: string, action: 'press' | 'release'): Promise<void>

  /** Sends a key chord: modifier names plus one final key. */
  keys(parts: string[]): Promise<void>

  /** Types literal text. */
  typeText(text: string): Promise<void>

  // --- Queries ---

  /** Synchronously describes the frontmost application, or `null` if unknown. */
  getFrontmostAppInfo(): FrontmostAppInfo | null
}
|
||||
553
packages/@ant/computer-use-mcp/src/deniedApps.ts
Normal file
553
packages/@ant/computer-use-mcp/src/deniedApps.ts
Normal file
@@ -0,0 +1,553 @@
|
||||
/**
|
||||
* App category lookup for tiered CU permissions. Three categories land at a
|
||||
* restricted tier instead of `"full"`:
|
||||
*
|
||||
* - **browser** → `"read"` tier — visible in screenshots, NO interaction.
|
||||
* The model can read an already-open page but must use the Claude-in-Chrome
|
||||
* MCP for navigation/clicking/typing.
|
||||
* - **terminal** → `"click"` tier — visible + clickable, NO typing. The
|
||||
* model can click a Run button or scroll test output in an IDE, but can't
|
||||
* type into the integrated terminal. Use the Bash tool for shell work.
|
||||
* - **trading** → `"read"` tier — same restrictions as browsers, but no
|
||||
* CiC-MCP alternative exists. For platforms where a stray click can
|
||||
* execute a trade or send a message to a counterparty.
|
||||
*
|
||||
* Uncategorized apps default to `"full"`. See `getDefaultTierForApp`.
|
||||
*
|
||||
* Identification is two-layered:
|
||||
* 1. Bundle ID match (macOS-only; `InstalledApp.bundleId` is a
|
||||
* CFBundleIdentifier and meaningless on Windows). Fast, exact, the
|
||||
* primary mechanism while CU is darwin-gated.
|
||||
* 2. Display-name substring match (cross-platform fallback). Catches
|
||||
* unresolved requests ("Chrome" when Chrome isn't installed) AND will
|
||||
* be the primary mechanism on Windows/Linux where there's no bundle ID.
|
||||
* Windows-relevant names (PowerShell, cmd, Windows Terminal) are
|
||||
* included now so they activate the moment the darwin gate lifts.
|
||||
*
|
||||
* Keep this file **import-free** (like sentinelApps.ts) — the renderer may
|
||||
* import it via a package.json subpath export, and pulling in
|
||||
* `@modelcontextprotocol/sdk` (a devDep) through the index → mcpServer chain
|
||||
* would fail module resolution in Next.js. The `CuAppPermTier` type is
|
||||
* duplicated as a string literal below rather than imported.
|
||||
*/
|
||||
|
||||
/** App categories that are granted a restricted tier instead of `"full"`. */
export type DeniedCategory = "browser" | "terminal" | "trading";

/**
 * Resolve the hardcoded permission tier for an app category.
 *
 * The return type is spelled out as an inline string-literal union because
 * this file must stay import-free; the authoritative definition is
 * `CuAppPermTier` in types.ts, and the two must be kept in sync.
 *
 * The mapping is many-to-one: `"browser"` and `"trading"` both resolve to
 * `"read"`. Copy that differs per category (for instance the browser-only
 * "use CiC" hint) therefore has to branch on the category, not on the tier.
 *
 * @param category The app's category, or `null` for an uncategorized app.
 * @returns `"read"` for browsers and trading apps, `"click"` for terminals,
 *   `"full"` otherwise.
 */
export function categoryToTier(
  category: DeniedCategory | null,
): "read" | "click" | "full" {
  switch (category) {
    case "browser":
    case "trading":
      return "read";
    case "terminal":
      return "click";
    default:
      return "full";
  }
}
|
||||
|
||||
// ─── Bundle-ID deny sets (macOS) ─────────────────────────────────────────
|
||||
|
||||
/** macOS bundle identifiers of web browsers, listed one vendor group per row. */
const BROWSER_BUNDLE_IDS: ReadonlySet<string> = new Set(
  [
    // Apple
    ["com.apple.Safari", "com.apple.SafariTechnologyPreview"],
    // Google
    [
      "com.google.Chrome",
      "com.google.Chrome.beta",
      "com.google.Chrome.dev",
      "com.google.Chrome.canary",
    ],
    // Microsoft
    [
      "com.microsoft.edgemac",
      "com.microsoft.edgemac.Beta",
      "com.microsoft.edgemac.Dev",
      "com.microsoft.edgemac.Canary",
    ],
    // Mozilla
    [
      "org.mozilla.firefox",
      "org.mozilla.firefoxdeveloperedition",
      "org.mozilla.nightly",
    ],
    // Chromium-based
    [
      "org.chromium.Chromium",
      "com.brave.Browser",
      "com.brave.Browser.beta",
      "com.brave.Browser.nightly",
      "com.operasoftware.Opera",
      "com.operasoftware.OperaGX",
      "com.operasoftware.OperaDeveloper",
      "com.vivaldi.Vivaldi",
    ],
    // The Browser Company
    [
      "company.thebrowser.Browser", // Arc
      "company.thebrowser.dia", // Dia (agentic)
    ],
    // Privacy-focused
    [
      "org.torproject.torbrowser",
      "com.duckduckgo.macos.browser",
      "ru.yandex.desktop.yandex-browser",
    ],
    // Agentic / AI browsers — newer entrants with LLM integrations
    [
      "ai.perplexity.comet",
      "com.sigmaos.sigmaos.macos", // SigmaOS
    ],
    // Webkit-based misc
    [
      "com.kagi.kagimacOS", // Orion
    ],
  ].flat(),
);
|
||||
|
||||
/**
|
||||
* Terminals + IDEs with integrated terminals. Supersets
|
||||
* `SHELL_ACCESS_BUNDLE_IDS` from sentinelApps.ts — terminals proceed to the
|
||||
* approval dialog at tier "click", and the sentinel warning renders
|
||||
* alongside the tier badge.
|
||||
*/
|
||||
const TERMINAL_BUNDLE_IDS: ReadonlySet<string> = new Set([
|
||||
// Dedicated terminals
|
||||
"com.apple.Terminal",
|
||||
"com.googlecode.iterm2",
|
||||
"dev.warp.Warp-Stable",
|
||||
"dev.warp.Warp-Beta",
|
||||
"com.github.wez.wezterm",
|
||||
"org.alacritty",
|
||||
"io.alacritty", // pre-v0.11.0 (renamed 2022-07) — kept for legacy installs
|
||||
"net.kovidgoyal.kitty",
|
||||
"co.zeit.hyper",
|
||||
"com.mitchellh.ghostty",
|
||||
"org.tabby",
|
||||
"com.termius-dmg.mac", // Termius
|
||||
// IDEs with integrated terminals — we can't distinguish "type in the
|
||||
// editor" from "type in the integrated terminal" via screenshot+click.
|
||||
// VS Code family
|
||||
"com.microsoft.VSCode",
|
||||
"com.microsoft.VSCodeInsiders",
|
||||
"com.vscodium", // VSCodium
|
||||
"com.todesktop.230313mzl4w4u92", // Cursor
|
||||
"com.exafunction.windsurf", // Windsurf / Codeium
|
||||
"dev.zed.Zed",
|
||||
"dev.zed.Zed-Preview",
|
||||
// JetBrains family (all have integrated terminals)
|
||||
"com.jetbrains.intellij",
|
||||
"com.jetbrains.intellij.ce",
|
||||
"com.jetbrains.pycharm",
|
||||
"com.jetbrains.pycharm.ce",
|
||||
"com.jetbrains.WebStorm",
|
||||
"com.jetbrains.CLion",
|
||||
"com.jetbrains.goland",
|
||||
"com.jetbrains.rubymine",
|
||||
"com.jetbrains.PhpStorm",
|
||||
"com.jetbrains.datagrip",
|
||||
"com.jetbrains.rider",
|
||||
"com.jetbrains.AppCode",
|
||||
"com.jetbrains.rustrover",
|
||||
"com.jetbrains.fleet",
|
||||
"com.google.android.studio", // Android Studio (JetBrains-based)
|
||||
// Other IDEs
|
||||
"com.axosoft.gitkraken", // GitKraken has an integrated terminal panel. Also keeps the "kraken" trading-substring from miscategorizing it — bundle-ID wins.
|
||||
"com.sublimetext.4",
|
||||
"com.sublimetext.3",
|
||||
"org.vim.MacVim",
|
||||
"com.neovim.neovim",
|
||||
"org.gnu.Emacs",
|
||||
// Xcode's previous carve-out (full tier for Interface Builder / simulator)
|
||||
// was reversed — at tier "click" IB and simulator taps still work (both are
|
||||
// plain clicks) while the integrated terminal is blocked from keyboard input.
|
||||
"com.apple.dt.Xcode",
|
||||
"org.eclipse.platform.ide",
|
||||
"org.netbeans.ide",
|
||||
"com.microsoft.visual-studio", // Visual Studio for Mac
|
||||
// AppleScript/automation execution surfaces — same threat as terminals:
|
||||
// type(script) → key("cmd+r") runs arbitrary code. Added after #28011
|
||||
// removed the osascript MCP server, making CU the only tool-call route
|
||||
// to AppleScript.
|
||||
"com.apple.ScriptEditor2",
|
||||
"com.apple.Automator",
|
||||
"com.apple.shortcuts",
|
||||
]);
|
||||
|
||||
/**
|
||||
* Trading / crypto platforms — granted at tier `"read"` so the agent can see
|
||||
* balances and prices but can't click into an order, transfer, or IB chat.
|
||||
* Bundle IDs populated from Homebrew cask `uninstall.quit` stanzas as they're
|
||||
* verified; the name-substring fallback below is the primary check. Bloomberg
|
||||
* Terminal has no native macOS build per their FAQ (web/Citrix only).
|
||||
*
|
||||
* Budgeting/accounting apps (Quicken, YNAB, QuickBooks, etc.) are NOT listed
|
||||
* here — they default to tier `"full"`. The risk model for brokerage/crypto
|
||||
* (a stray click can execute a trade) doesn't apply to budgeting apps; the
|
||||
* Cowork system prompt carries the soft instruction to never execute trades
|
||||
* or transfer money on the user's behalf.
|
||||
*/
|
||||
const TRADING_BUNDLE_IDS: ReadonlySet<string> = new Set([
|
||||
// Verified via Homebrew quit/zap stanzas + mdls + electron-builder source.
|
||||
// Trading
|
||||
"com.webull.desktop.v1", // Webull (direct download, Qt)
|
||||
"com.webull.trade.mac.v1", // Webull (Mac App Store)
|
||||
"com.tastytrade.desktop",
|
||||
"com.tradingview.tradingviewapp.desktop",
|
||||
"com.fidelity.activetrader", // Fidelity Trader+ (new)
|
||||
"com.fmr.activetrader", // Fidelity Active Trader Pro (legacy)
|
||||
// Interactive Brokers TWS — install4j wrapper; Homebrew quit stanza is
|
||||
// authoritative for this exact value but install4j IDs can drift across
|
||||
// major versions — name-substring "trader workstation" is the fallback.
|
||||
"com.install4j.5889-6375-8446-2021",
|
||||
// Crypto
|
||||
"com.binance.BinanceDesktop",
|
||||
"com.electron.exodus",
|
||||
// Electrum uses PyInstaller with bundle_identifier=None → defaults to
|
||||
// org.pythonmac.unspecified.<AppName>. Confirmed in spesmilo/electrum
|
||||
// source + Homebrew zap. IntuneBrew's "org.electrum.electrum" is a fork.
|
||||
"org.pythonmac.unspecified.Electrum",
|
||||
"com.ledger.live",
|
||||
"io.trezor.TrezorSuite",
|
||||
// No native macOS app (name-substring only): Schwab, E*TRADE, TradeStation,
|
||||
// Robinhood, NinjaTrader, Coinbase, Kraken, Bloomberg. thinkorswim
|
||||
// install4j ID drifts per-install — substring safer.
|
||||
]);
|
||||
|
||||
// ─── Policy-deny (not a tier — cannot be granted at all) ─────────────────
|
||||
//
|
||||
// Streaming / ebook / music apps and a handful of publisher apps. These
|
||||
// are auto-denied before the approval dialog — no tier can be granted.
|
||||
// Rationale is copyright / content-control (the agent has no legitimate
|
||||
// need to screenshot Netflix or click Play on Spotify).
|
||||
//
|
||||
// Sourced from the ACP CU-apps blocklist xlsx ("Full block" tab). See
|
||||
// /tmp/extract_cu_blocklist.py for the extraction script.
|
||||
|
||||
const POLICY_DENIED_BUNDLE_IDS: ReadonlySet<string> = new Set([
|
||||
// Verified via Homebrew quit/zap + mdls /System/Applications + IntuneBrew.
|
||||
// Apple built-ins
|
||||
"com.apple.TV",
|
||||
"com.apple.Music",
|
||||
"com.apple.iBooksX",
|
||||
"com.apple.podcasts",
|
||||
// Music
|
||||
"com.spotify.client",
|
||||
"com.amazon.music",
|
||||
"com.tidal.desktop",
|
||||
"com.deezer.deezer-desktop",
|
||||
"com.pandora.desktop",
|
||||
"com.electron.pocket-casts", // direct-download Electron wrapper
|
||||
"au.com.shiftyjelly.PocketCasts", // Mac App Store
|
||||
// Video
|
||||
"tv.plex.desktop",
|
||||
"tv.plex.htpc",
|
||||
"tv.plex.plexamp",
|
||||
"com.amazon.aiv.AIVApp", // Prime Video (iOS-on-Apple-Silicon)
|
||||
// Ebooks
|
||||
"net.kovidgoyal.calibre",
|
||||
"com.amazon.Kindle", // legacy desktop, discontinued
|
||||
"com.amazon.Lassen", // current Mac App Store (iOS-on-Mac)
|
||||
"com.kobo.desktop.Kobo",
|
||||
// No native macOS app (name-substring only): Netflix, Disney+, Hulu,
|
||||
// HBO Max, Peacock, Paramount+, YouTube, Crunchyroll, Tubi, Vudu,
|
||||
// Audible, Reddit, NYTimes. Their iOS apps don't opt into iPad-on-Mac.
|
||||
]);
|
||||
|
||||
const POLICY_DENIED_NAME_SUBSTRINGS: readonly string[] = [
|
||||
// Video streaming
|
||||
"netflix",
|
||||
"disney+",
|
||||
"hulu",
|
||||
"prime video",
|
||||
"apple tv",
|
||||
"peacock",
|
||||
"paramount+",
|
||||
// "plex" is too generic — would match "Perplexity". Covered by
|
||||
// tv.plex.* bundle IDs on macOS.
|
||||
"tubi",
|
||||
"crunchyroll",
|
||||
"vudu",
|
||||
// E-readers / audiobooks
|
||||
"kindle",
|
||||
"apple books",
|
||||
"kobo",
|
||||
"play books",
|
||||
"calibre",
|
||||
"libby",
|
||||
"readium",
|
||||
"audible",
|
||||
"libro.fm",
|
||||
"speechify",
|
||||
// Music
|
||||
"spotify",
|
||||
"apple music",
|
||||
"amazon music",
|
||||
"youtube music",
|
||||
"tidal",
|
||||
"deezer",
|
||||
"pandora",
|
||||
"pocket casts",
|
||||
// Publisher / social apps (from the same blocklist tab)
|
||||
"naver",
|
||||
"reddit",
|
||||
"sony music",
|
||||
"vegas pro",
|
||||
"pitchfork",
|
||||
"economist",
|
||||
"nytimes",
|
||||
// Skipped (too generic for substring matching — need bundle ID):
|
||||
// HBO Max / Max, YouTube (non-Music), Nook, Sony Catalyst, Wired
|
||||
];
|
||||
|
||||
/**
 * Build-time policy deny check.
 *
 * In contrast to `userDeniedBundleIds` (configured per user on the Settings
 * page), this list ships with the build. `buildAccessRequest` removes matching
 * apps before the approval dialog is shown, attaches "blocked by policy"
 * guidance, and the agent is told not to retry.
 *
 * @param bundleId - Resolved bundle ID, or `undefined` when unresolved.
 * @param displayName - App name; compared case-insensitively by substring.
 * @returns `true` when the bundle ID is in the policy set or the lowercased
 *   name contains any policy substring.
 */
export function isPolicyDenied(
  bundleId: string | undefined,
  displayName: string,
): boolean {
  // An exact bundle-ID hit short-circuits the name scan.
  if (bundleId && POLICY_DENIED_BUNDLE_IDS.has(bundleId)) {
    return true;
  }
  const haystack = displayName.toLowerCase();
  return POLICY_DENIED_NAME_SUBSTRINGS.some((fragment) =>
    haystack.includes(fragment),
  );
}
|
||||
|
||||
/**
 * Exact bundle-ID lookup against the browser, terminal and trading deny
 * sets, consulted in that order.
 *
 * @param bundleId - Bundle identifier to look up (exact, case-sensitive).
 * @returns The category of the first set that contains `bundleId`, or `null`
 *   when none does.
 */
export function getDeniedCategory(bundleId: string): DeniedCategory | null {
  const lookupOrder: ReadonlyArray<
    readonly [ReadonlySet<string>, DeniedCategory]
  > = [
    [BROWSER_BUNDLE_IDS, "browser"],
    [TERMINAL_BUNDLE_IDS, "terminal"],
    [TRADING_BUNDLE_IDS, "trading"],
  ];
  for (const [ids, category] of lookupOrder) {
    if (ids.has(bundleId)) {
      return category;
    }
  }
  return null;
}
|
||||
|
||||
// ─── Display-name fallback (cross-platform) ──────────────────────────────
|
||||
|
||||
/**
|
||||
* Lowercase substrings checked against the requested display name. Catches:
|
||||
* - Unresolved requests (app not installed, Spotlight miss)
|
||||
* - Future Windows/Linux support where bundleId is meaningless
|
||||
*
|
||||
* Matched via `.includes()` on `name.toLowerCase()`. Order within each list
|
||||
* does not affect the result (every entry in a list maps to the same
|
||||
* category); entries are grouped by kind purely for readability.
|
||||
*/
|
||||
const BROWSER_NAME_SUBSTRINGS: readonly string[] = [
|
||||
"safari",
|
||||
"chrome",
|
||||
"firefox",
|
||||
"microsoft edge",
|
||||
"brave",
|
||||
"opera",
|
||||
"vivaldi",
|
||||
"chromium",
|
||||
// Arc/Dia: the canonical display name is just "Arc"/"Dia" — too short for
|
||||
// substring matching (false-positives: "Arcade", "Diagram"). Covered by
|
||||
// bundle ID on macOS. The "... browser" entries below catch natural-language
|
||||
// phrasings ("the arc browser") but NOT the canonical short name.
|
||||
"arc browser",
|
||||
"tor browser",
|
||||
"duckduckgo",
|
||||
"yandex",
|
||||
"orion browser",
|
||||
// Agentic / AI browsers
|
||||
"comet", // Perplexity's browser — "Comet" substring risks false positives
|
||||
// but leaving for now; "comet" in an app name is rare
|
||||
"sigmaos",
|
||||
"dia browser",
|
||||
];
|
||||
|
||||
const TERMINAL_NAME_SUBSTRINGS: readonly string[] = [
|
||||
// macOS / cross-platform terminals
|
||||
"terminal", // catches Terminal, Windows Terminal (NOT iTerm — separate entry)
|
||||
"iterm",
|
||||
"wezterm",
|
||||
"alacritty",
|
||||
"kitty",
|
||||
"ghostty",
|
||||
"tabby",
|
||||
"termius",
|
||||
// AppleScript runners — see bundle-ID comment above. "shortcuts" is too
|
||||
// generic for substring matching (many apps have "shortcuts" in the name);
|
||||
// covered by bundle ID only, like warp/hyper.
|
||||
"script editor",
|
||||
"automator",
|
||||
// NOTE: "warp" and "hyper" are too generic for substring matching —
|
||||
// they'd false-positive on "Warpaint" or "Hyperion". Covered by bundle ID
|
||||
// (dev.warp.Warp-Stable, co.zeit.hyper) for macOS; Windows exe-name
|
||||
// matching can be added when Windows CU ships.
|
||||
// Windows shells (activate when the darwin gate lifts)
|
||||
"powershell",
|
||||
"cmd.exe",
|
||||
"command prompt",
|
||||
"git bash",
|
||||
"conemu",
|
||||
"cmder",
|
||||
// IDEs (VS Code family)
|
||||
"visual studio code",
|
||||
"visual studio", // catches VS for Mac + Windows
|
||||
"vscode",
|
||||
"vs code",
|
||||
"vscodium",
|
||||
"cursor", // Cursor IDE — "cursor" is generic but IDE is the only common app
|
||||
"windsurf",
|
||||
// Zed: display name is just "Zed" — too short for substring matching
|
||||
// (false-positives). Covered by bundle ID (dev.zed.Zed) on macOS.
|
||||
// IDEs (JetBrains family)
|
||||
"intellij",
|
||||
"pycharm",
|
||||
"webstorm",
|
||||
"clion",
|
||||
"goland",
|
||||
"rubymine",
|
||||
"phpstorm",
|
||||
"datagrip",
|
||||
"rider",
|
||||
"appcode",
|
||||
"rustrover",
|
||||
"fleet",
|
||||
"android studio",
|
||||
// Other IDEs
|
||||
"sublime text",
|
||||
"macvim",
|
||||
"neovim",
|
||||
"emacs",
|
||||
"xcode",
|
||||
"eclipse",
|
||||
"netbeans",
|
||||
];
|
||||
|
||||
const TRADING_NAME_SUBSTRINGS: readonly string[] = [
|
||||
// Trading — brokerage apps. Sourced from the ACP CU-apps blocklist xlsx
|
||||
// ("Read Only" tab). Name-substring safe for proper nouns below; generic
|
||||
// names (IG, Delta, HTX) are skipped and need bundle-ID matching once
|
||||
// verified.
|
||||
"bloomberg",
|
||||
"ameritrade",
|
||||
"thinkorswim",
|
||||
"schwab",
|
||||
"fidelity",
|
||||
"e*trade",
|
||||
"interactive brokers",
|
||||
"trader workstation", // Interactive Brokers TWS
|
||||
"tradestation",
|
||||
"webull",
|
||||
"robinhood",
|
||||
"tastytrade",
|
||||
"ninjatrader",
|
||||
"tradingview",
|
||||
"moomoo",
|
||||
"tradezero",
|
||||
"prorealtime",
|
||||
"plus500",
|
||||
"saxotrader",
|
||||
"oanda",
|
||||
"metatrader",
|
||||
"forex.com",
|
||||
"avaoptions",
|
||||
"ctrader",
|
||||
"jforex",
|
||||
"iq option",
|
||||
"olymp trade",
|
||||
"binomo",
|
||||
"pocket option",
|
||||
"raceoption",
|
||||
"expertoption",
|
||||
"quotex",
|
||||
"naga",
|
||||
"morgan stanley",
|
||||
"ubs neo",
|
||||
"eikon", // Thomson Reuters / LSEG Workspace
|
||||
// Crypto — exchanges, wallets, portfolio trackers
|
||||
"coinbase",
|
||||
"kraken",
|
||||
"binance",
|
||||
"okx",
|
||||
"bybit",
|
||||
// "gate.io" is too generic — the ".io" TLD suffix is common in app names
|
||||
// (e.g., "Draw.io"). Needs bundle-ID matching once verified.
|
||||
"phemex",
|
||||
"stormgain",
|
||||
"crypto.com",
|
||||
// "exodus" is too generic — it's a common noun and would match unrelated
|
||||
// apps/games. Needs bundle-ID matching once verified.
|
||||
"electrum",
|
||||
"ledger live",
|
||||
"trezor",
|
||||
"guarda",
|
||||
"atomic wallet",
|
||||
"bitpay",
|
||||
"bisq",
|
||||
"koinly",
|
||||
"cointracker",
|
||||
"blockfi",
|
||||
"stripe cli",
|
||||
// Crypto games / metaverse (same trade-execution risk model)
|
||||
"decentraland",
|
||||
"axie infinity",
|
||||
"gods unchained",
|
||||
];
|
||||
|
||||
/**
 * Categorize an app by its display name alone.
 *
 * Used when bundle-ID resolution produced nothing (`resolved === undefined`)
 * or when the bundle ID matched no deny-list entry. The comparison is a
 * case-insensitive substring test, so `"Google Chrome"`, `"chrome"` and
 * `"Chrome Canary"` all hit the `"chrome"` entry.
 *
 * @param name - Requested or resolved display name.
 * @returns The first matching category in trading → browser → terminal
 *   order, or `null` when nothing matches.
 */
export function getDeniedCategoryByDisplayName(
  name: string,
): DeniedCategory | null {
  const haystack = name.toLowerCase();
  const containsAny = (fragments: readonly string[]): boolean =>
    fragments.some((fragment) => haystack.includes(fragment));

  // Trading goes first: it is a proper-noun-only list and therefore the most
  // specific. "Bloomberg Terminal" contains "terminal" and would be
  // miscategorized if the terminal list were consulted before it.
  if (containsAny(TRADING_NAME_SUBSTRINGS)) {
    return "trading";
  }
  if (containsAny(BROWSER_NAME_SUBSTRINGS)) {
    return "browser";
  }
  if (containsAny(TERMINAL_NAME_SUBSTRINGS)) {
    return "terminal";
  }
  return null;
}
|
||||
|
||||
/**
 * Combined category lookup — the entry point tool-call handlers should use.
 *
 * The exact bundle-ID tables are consulted first; the display-name substring
 * tables are the fallback.
 *
 * @param bundleId - Resolved bundle ID. May be `undefined` for an unresolved
 *   request (the model asked for an app that isn't installed, or Spotlight
 *   didn't find it); then only the display-name check runs.
 * @param displayName - App name used for the substring fallback.
 * @returns The matched category, or `null` when neither lookup matches.
 */
export function getDeniedCategoryForApp(
  bundleId: string | undefined,
  displayName: string,
): DeniedCategory | null {
  const categoryFromBundleId = bundleId ? getDeniedCategory(bundleId) : null;
  return categoryFromBundleId ?? getDeniedCategoryByDisplayName(displayName);
}
|
||||
|
||||
/**
 * Tier proposed for an app at grant time: `getDeniedCategoryForApp` followed
 * by `categoryToTier`.
 *
 * Browsers and trading apps → `"read"`, terminals/IDEs → `"click"`,
 * everything else → `"full"`.
 *
 * `buildAccessRequest` calls this to fill `ResolvedAppRequest.proposedTier`
 * before the approval dialog is shown.
 *
 * @param bundleId - Resolved bundle ID, or `undefined` when unresolved.
 * @param displayName - App name used for the substring fallback.
 * @returns The default tier for the app.
 */
export function getDefaultTierForApp(
  bundleId: string | undefined,
  displayName: string,
): "read" | "click" | "full" {
  const category = getDeniedCategoryForApp(bundleId, displayName);
  return categoryToTier(category);
}
|
||||
|
||||
// Single handle bundling the module-private deny tables (four bundle-ID sets
// and four name-substring lists). Presumably exported so unit tests can
// inspect the tables — NOTE(review): confirm no production code reads `_test`.
export const _test = {
|
||||
BROWSER_BUNDLE_IDS,
|
||||
TERMINAL_BUNDLE_IDS,
|
||||
TRADING_BUNDLE_IDS,
|
||||
POLICY_DENIED_BUNDLE_IDS,
|
||||
BROWSER_NAME_SUBSTRINGS,
|
||||
TERMINAL_NAME_SUBSTRINGS,
|
||||
TRADING_NAME_SUBSTRINGS,
|
||||
POLICY_DENIED_NAME_SUBSTRINGS,
|
||||
};
|
||||
111
packages/@ant/computer-use-mcp/src/executor.ts
Normal file
111
packages/@ant/computer-use-mcp/src/executor.ts
Normal file
@@ -0,0 +1,111 @@
|
||||
/**
 * Geometry record for one display: identifier, size, scale factor and origin.
 * This is the element/return type of `ComputerExecutor.listDisplays()` and
 * `ComputerExecutor.getDisplaySize()`.
 * NOTE(review): units (logical points vs. physical pixels) are not stated in
 * this file — confirm against the native backend.
 */
export interface DisplayGeometry {
|
||||
displayId: number
|
||||
width: number
|
||||
height: number
|
||||
scaleFactor: number
|
||||
originX: number
|
||||
originY: number
|
||||
}
|
||||
|
||||
/**
 * Return type of `ComputerExecutor.screenshot()`: a base64 image payload plus
 * size and origin metadata. Per the imageResize header in this package,
 * `width`/`height` are the image dimensions the model sees.
 * NOTE(review): `displayWidth`/`displayHeight` presumably describe the source
 * display rather than the image — confirm.
 */
export interface ScreenshotResult {
|
||||
base64: string
|
||||
width: number
|
||||
height: number
|
||||
displayWidth: number
|
||||
displayHeight: number
|
||||
originX: number
|
||||
originY: number
|
||||
displayId?: number
|
||||
}
|
||||
|
||||
/**
 * Bundle identifier and display name of an application; the non-null payload
 * of `ComputerExecutor.getFrontmostApp()`.
 */
export interface FrontmostApp {
|
||||
bundleId: string
|
||||
displayName: string
|
||||
}
|
||||
|
||||
/**
 * One entry of `ComputerExecutor.listInstalledApps()`.
 * Per the deniedApps header in this package, `bundleId` is a macOS
 * CFBundleIdentifier and is meaningless on Windows.
 * NOTE(review): `iconDataUrl` is presumably a data: URL for the app icon and
 * may be absent — confirm with the producer.
 */
export interface InstalledApp {
|
||||
bundleId: string
|
||||
displayName: string
|
||||
path: string
|
||||
iconDataUrl?: string
|
||||
}
|
||||
|
||||
/**
 * One entry of `ComputerExecutor.listRunningApps()`. `pid` is optional, so
 * consumers must handle its absence.
 */
export interface RunningApp {
|
||||
bundleId: string
|
||||
displayName: string
|
||||
pid?: number
|
||||
}
|
||||
|
||||
/**
 * Return type of `ComputerExecutor.resolvePrepareCapture()`. Extends
 * `ScreenshotResult`, narrowing `displayId` from optional to required and
 * adding `hidden` and `activated`.
 * NOTE(review): `hidden` / `activated` presumably carry app identifiers that
 * were hidden / brought forward for the capture — confirm with the executor.
 */
export interface ResolvePrepareCaptureResult extends ScreenshotResult {
|
||||
hidden: string[]
|
||||
activated?: string
|
||||
displayId: number
|
||||
}
|
||||
|
||||
/**
 * Static capability descriptor exposed as `ComputerExecutor.capabilities`.
 * NOTE(review): `platform` admits only 'darwin' | 'win32', while the project
 * DEV-LOG describes Computer Use as enabled on macOS, Windows and Linux —
 * confirm whether 'linux' should be part of this union.
 */
export interface ComputerExecutorCapabilities {
|
||||
screenshotFiltering: 'native' | 'none'
|
||||
platform: 'darwin' | 'win32'
|
||||
hostBundleId: string
|
||||
}
|
||||
|
||||
export interface ComputerExecutor {
|
||||
capabilities: ComputerExecutorCapabilities
|
||||
prepareForAction(
|
||||
allowlistBundleIds: string[],
|
||||
displayId?: number,
|
||||
): Promise<string[]>
|
||||
previewHideSet(
|
||||
allowlistBundleIds: string[],
|
||||
displayId?: number,
|
||||
): Promise<Array<{ bundleId: string; displayName: string }>>
|
||||
getDisplaySize(displayId?: number): Promise<DisplayGeometry>
|
||||
listDisplays(): Promise<DisplayGeometry[]>
|
||||
findWindowDisplays(
|
||||
bundleIds: string[],
|
||||
): Promise<Array<{ bundleId: string; displayIds: number[] }>>
|
||||
resolvePrepareCapture(opts: {
|
||||
allowedBundleIds: string[]
|
||||
preferredDisplayId?: number
|
||||
autoResolve: boolean
|
||||
doHide?: boolean
|
||||
}): Promise<ResolvePrepareCaptureResult>
|
||||
screenshot(opts: {
|
||||
allowedBundleIds: string[]
|
||||
displayId?: number
|
||||
}): Promise<ScreenshotResult>
|
||||
zoom(
|
||||
regionLogical: { x: number; y: number; w: number; h: number },
|
||||
allowedBundleIds: string[],
|
||||
displayId?: number,
|
||||
): Promise<{ base64: string; width: number; height: number }>
|
||||
key(keySequence: string, repeat?: number): Promise<void>
|
||||
holdKey(keyNames: string[], durationMs: number): Promise<void>
|
||||
type(text: string, opts: { viaClipboard: boolean }): Promise<void>
|
||||
readClipboard(): Promise<string>
|
||||
writeClipboard(text: string): Promise<void>
|
||||
moveMouse(x: number, y: number): Promise<void>
|
||||
click(
|
||||
x: number,
|
||||
y: number,
|
||||
button: 'left' | 'right' | 'middle',
|
||||
count: 1 | 2 | 3,
|
||||
modifiers?: string[],
|
||||
): Promise<void>
|
||||
mouseDown(): Promise<void>
|
||||
mouseUp(): Promise<void>
|
||||
getCursorPosition(): Promise<{ x: number; y: number }>
|
||||
drag(
|
||||
from: { x: number; y: number } | undefined,
|
||||
to: { x: number; y: number },
|
||||
): Promise<void>
|
||||
scroll(x: number, y: number, dx: number, dy: number): Promise<void>
|
||||
getFrontmostApp(): Promise<FrontmostApp | null>
|
||||
appUnderPoint(
|
||||
x: number,
|
||||
y: number,
|
||||
): Promise<{ bundleId: string; displayName: string } | null>
|
||||
listInstalledApps(): Promise<InstalledApp[]>
|
||||
getAppIcon(path: string): Promise<string | undefined>
|
||||
listRunningApps(): Promise<RunningApp[]>
|
||||
openApp(bundleId: string): Promise<void>
|
||||
}
|
||||
108
packages/@ant/computer-use-mcp/src/imageResize.ts
Normal file
108
packages/@ant/computer-use-mcp/src/imageResize.ts
Normal file
@@ -0,0 +1,108 @@
|
||||
/**
|
||||
* Port of the API's image transcoder target-size algorithm. Pre-sizing
|
||||
* screenshots to this function's output means the API's early-return fires
|
||||
* (tokens ≤ max) and the image is NOT resized server-side — so the model
|
||||
* sees exactly the dimensions in `ScreenshotResult.width/height` and
|
||||
* `scaleCoord` stays coherent.
|
||||
*
|
||||
* Rust reference: api/api/image_transcoder/rust_transcoder/src/utils/resize.rs
|
||||
* Sibling TS port: apps/claude-browser-use/src/utils/imageResize.ts (identical
|
||||
* algorithm, lives in the Chrome extension tree — not a shared package).
|
||||
*
|
||||
* See COORDINATES.md for why this matters for click accuracy.
|
||||
*/
|
||||
|
||||
/** Limits that drive {@link targetImageSize}. */
export interface ResizeParams {
  /** Edge length, in pixels, of one square tile (one token). */
  pxPerToken: number;
  /** Upper bound, in pixels, applied to both image edges. */
  maxTargetPx: number;
  /** Upper bound on ceil(w / pxPerToken) × ceil(h / pxPerToken). */
  maxTargetTokens: number;
}

/**
 * Production defaults — match `resize.rs:160-164` and Chrome's
 * `CDPService.ts:638-642`. Vision encoder uses 28px tiles; 1568 is both
 * the long-edge cap (56 tiles) AND the token budget.
 */
export const API_RESIZE_PARAMS: ResizeParams = {
  pxPerToken: 28,
  maxTargetPx: 1568,
  maxTargetTokens: 1568,
};

/**
 * Tiles needed to cover `px` pixels along one axis.
 *
 * Written as integer ceil-div (matches resize.rs:74-76), so it equals
 * ceil(px / pxPerToken) for non-negative integer `px`; fractional `px` is
 * not rounded up the way a true `Math.ceil` would.
 */
export function nTokensForPx(px: number, pxPerToken: number): number {
  return Math.floor((px - 1) / pxPerToken) + 1;
}

/** Tile count of a `width` × `height` image: per-axis tile counts multiplied. */
function nTokensForImg(
  width: number,
  height: number,
  pxPerToken: number,
): number {
  return nTokensForPx(width, pxPerToken) * nTokensForPx(height, pxPerToken);
}

/**
 * Binary-search along the width dimension for the largest image that:
 *   - preserves the input aspect ratio
 *   - has long edge ≤ maxTargetPx
 *   - has ceil(w/pxPerToken) × ceil(h/pxPerToken) ≤ maxTargetTokens
 *
 * Returns [width, height]. No-op if input already satisfies all three.
 *
 * The long-edge constraint alone is insufficient on squarer-than-16:9
 * displays: 1568×1014 (MBP 16" AR) is 56×37 = 2072 tokens, over budget, and
 * gets server-resized to 1372×887 — the model then clicks in 1372-space but
 * scaleCoord assumed 1568-space → ~14% coord error.
 *
 * For integer dimensions the result matches resize.rs:91-155. On top of that
 * port, this version is guaranteed to terminate: non-finite dimensions are
 * rejected, and the search stops as soon as the midpoint can no longer move
 * (previously NaN/Infinity, some fractional widths, and bounds that start out
 * equal looped forever).
 *
 * @param width - Source width in pixels; must be finite.
 * @param height - Source height in pixels; must be finite.
 * @param params - Tile size, edge cap and token budget.
 * @returns `[width, height]` of the target image.
 * @throws RangeError when `width` or `height` is NaN or ±Infinity.
 */
export function targetImageSize(
  width: number,
  height: number,
  params: ResizeParams,
): [number, number] {
  // NaN/±Infinity can never satisfy the limits and would keep the search
  // below from converging, so fail fast instead of hanging.
  if (!Number.isFinite(width) || !Number.isFinite(height)) {
    throw new RangeError(
      `targetImageSize: width and height must be finite numbers (got ${width}x${height})`,
    );
  }

  const { pxPerToken, maxTargetPx, maxTargetTokens } = params;

  if (
    width <= maxTargetPx &&
    height <= maxTargetPx &&
    nTokensForImg(width, height, pxPerToken) <= maxTargetTokens
  ) {
    return [width, height];
  }

  // Normalize to landscape for the search; transpose result back.
  if (height > width) {
    const [w, h] = targetImageSize(height, width, params);
    return [h, w];
  }

  const aspectRatio = width / height;

  // Loop invariant: lowerBoundWidth is treated as valid, upperBoundWidth as
  // invalid. ~12 iterations for a 4000px image.
  let upperBoundWidth = width;
  let lowerBoundWidth = 1;

  for (;;) {
    const middleWidth = Math.floor((lowerBoundWidth + upperBoundWidth) / 2);

    // No integer lies strictly between the bounds any more. For integer
    // bounds this is exactly `lowerBoundWidth + 1 === upperBoundWidth`; it
    // additionally covers fractional or already-collapsed bounds.
    if (middleWidth <= lowerBoundWidth) {
      return [
        lowerBoundWidth,
        Math.max(Math.round(lowerBoundWidth / aspectRatio), 1),
      ];
    }

    const middleHeight = Math.max(Math.round(middleWidth / aspectRatio), 1);

    if (
      middleWidth <= maxTargetPx &&
      nTokensForImg(middleWidth, middleHeight, pxPerToken) <= maxTargetTokens
    ) {
      lowerBoundWidth = middleWidth;
    } else {
      upperBoundWidth = middleWidth;
    }
  }
}
|
||||
@@ -1,163 +1,69 @@
|
||||
/**
|
||||
* @ant/computer-use-mcp — Stub 实现
|
||||
*
|
||||
* 提供类型安全的 stub,所有函数返回合理的默认值。
|
||||
* 在 feature('CHICAGO_MCP') = false 时不会被实际调用,
|
||||
* 但确保 import 不报错且类型正确。
|
||||
*/
|
||||
|
||||
import type {
|
||||
ComputerUseHostAdapter,
|
||||
CoordinateMode,
|
||||
GrantFlags,
|
||||
Logger,
|
||||
} from './types'
|
||||
|
||||
// Re-export types from types.ts
|
||||
export type { CoordinateMode, Logger } from './types'
|
||||
export type {
|
||||
ComputerUseConfig,
|
||||
ComputerExecutor,
|
||||
DisplayGeometry,
|
||||
FrontmostApp,
|
||||
InstalledApp,
|
||||
ResolvePrepareCaptureResult,
|
||||
RunningApp,
|
||||
ScreenshotResult,
|
||||
} from "./executor.js";
|
||||
|
||||
export type {
|
||||
AppGrant,
|
||||
CuAppPermTier,
|
||||
ComputerUseHostAdapter,
|
||||
ComputerUseOverrides,
|
||||
ComputerUseSessionContext,
|
||||
CoordinateMode,
|
||||
CuGrantFlags,
|
||||
CuPermissionRequest,
|
||||
CuPermissionResponse,
|
||||
CuSubGates,
|
||||
} from './types'
|
||||
export { DEFAULT_GRANT_FLAGS } from './types'
|
||||
CuTeachPermissionRequest,
|
||||
Logger,
|
||||
ResolvedAppRequest,
|
||||
ScreenshotDims,
|
||||
TeachStepRequest,
|
||||
TeachStepResult,
|
||||
} from "./types.js";
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types (defined here for callers that import from the main entry)
|
||||
// ---------------------------------------------------------------------------
|
||||
export { DEFAULT_GRANT_FLAGS } from "./types.js";
|
||||
|
||||
export interface DisplayGeometry {
|
||||
width: number
|
||||
height: number
|
||||
displayId?: number
|
||||
originX?: number
|
||||
originY?: number
|
||||
}
|
||||
export {
|
||||
SENTINEL_BUNDLE_IDS,
|
||||
getSentinelCategory,
|
||||
} from "./sentinelApps.js";
|
||||
export type { SentinelCategory } from "./sentinelApps.js";
|
||||
|
||||
export interface FrontmostApp {
|
||||
bundleId: string
|
||||
displayName: string
|
||||
}
|
||||
export {
|
||||
categoryToTier,
|
||||
getDefaultTierForApp,
|
||||
getDeniedCategory,
|
||||
getDeniedCategoryByDisplayName,
|
||||
getDeniedCategoryForApp,
|
||||
isPolicyDenied,
|
||||
} from "./deniedApps.js";
|
||||
export type { DeniedCategory } from "./deniedApps.js";
|
||||
|
||||
export interface InstalledApp {
|
||||
bundleId: string
|
||||
displayName: string
|
||||
path: string
|
||||
}
|
||||
export { isSystemKeyCombo, normalizeKeySequence } from "./keyBlocklist.js";
|
||||
|
||||
export interface RunningApp {
|
||||
bundleId: string
|
||||
displayName: string
|
||||
}
|
||||
export { ALL_SUB_GATES_OFF, ALL_SUB_GATES_ON } from "./subGates.js";
|
||||
|
||||
export interface ScreenshotResult {
|
||||
base64: string
|
||||
width: number
|
||||
height: number
|
||||
}
|
||||
export { API_RESIZE_PARAMS, targetImageSize } from "./imageResize.js";
|
||||
export type { ResizeParams } from "./imageResize.js";
|
||||
|
||||
export type ResolvePrepareCaptureResult = ScreenshotResult
|
||||
export { defersLockAcquire, handleToolCall } from "./toolCalls.js";
|
||||
export type {
|
||||
CuCallTelemetry,
|
||||
CuCallToolResult,
|
||||
CuErrorKind,
|
||||
} from "./toolCalls.js";
|
||||
|
||||
export interface ScreenshotDims {
|
||||
width: number
|
||||
height: number
|
||||
displayWidth: number
|
||||
displayHeight: number
|
||||
displayId: number
|
||||
originX: number
|
||||
originY: number
|
||||
}
|
||||
export { bindSessionContext, createComputerUseMcpServer } from "./mcpServer.js";
|
||||
export { buildComputerUseTools } from "./tools.js";
|
||||
|
||||
export interface CuCallToolResultContent {
|
||||
type: 'image' | 'text'
|
||||
data?: string
|
||||
mimeType?: string
|
||||
text?: string
|
||||
}
|
||||
|
||||
export interface CuCallToolResult {
|
||||
content: CuCallToolResultContent[]
|
||||
telemetry: {
|
||||
error_kind?: string
|
||||
[key: string]: unknown
|
||||
}
|
||||
}
|
||||
|
||||
export type ComputerUseSessionContext = Record<string, unknown>
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// API_RESIZE_PARAMS — 默认的截图缩放参数
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const API_RESIZE_PARAMS = {
|
||||
maxWidth: 1280,
|
||||
maxHeight: 800,
|
||||
maxPixels: 1280 * 800,
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ComputerExecutor — stub class
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export class ComputerExecutor {
|
||||
capabilities: Record<string, boolean> = {}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Functions — 返回合理默认值的 stub
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
|
||||
* 计算目标截图尺寸。
|
||||
* 在物理宽高和 API 限制之间取最优尺寸。
|
||||
*/
|
||||
export function targetImageSize(
|
||||
physW: number,
|
||||
physH: number,
|
||||
_params?: typeof API_RESIZE_PARAMS,
|
||||
): [number, number] {
|
||||
const maxW = _params?.maxWidth ?? 1280
|
||||
const maxH = _params?.maxHeight ?? 800
|
||||
const scale = Math.min(1, maxW / physW, maxH / physH)
|
||||
return [Math.round(physW * scale), Math.round(physH * scale)]
|
||||
}
|
||||
|
||||
/**
|
||||
* 绑定会话上下文,返回工具调度函数。
|
||||
* Stub 返回一个始终返回空结果的调度器。
|
||||
*/
|
||||
export function bindSessionContext(
|
||||
_adapter: ComputerUseHostAdapter,
|
||||
_coordinateMode: CoordinateMode,
|
||||
_ctx: ComputerUseSessionContext,
|
||||
): (name: string, args: unknown) => Promise<CuCallToolResult> {
|
||||
return async (_name: string, _args: unknown) => ({
|
||||
content: [],
|
||||
telemetry: {},
|
||||
})
|
||||
}
|
||||
|
||||
/**
 * Build the list of Computer Use tool definitions.
 *
 * Stub: ignores its arguments and returns a new empty array (no tools).
 */
export function buildComputerUseTools(
  _capabilities?: Record<string, boolean>,
  _coordinateMode?: CoordinateMode,
  _installedAppNames?: string[],
): Array<{ name: string; description: string; inputSchema: Record<string, unknown> }> {
  const noTools: Array<{
    name: string
    description: string
    inputSchema: Record<string, unknown>
  }> = []
  return noTools
}
|
||||
|
||||
/**
 * Create the Computer Use MCP server.
 *
 * Stub: ignores its arguments and always returns null (service not enabled).
 */
export function createComputerUseMcpServer(
  _adapter?: ComputerUseHostAdapter,
  _coordinateMode?: CoordinateMode,
): null {
  const disabledServer = null
  return disabledServer
}
|
||||
export {
|
||||
comparePixelAtLocation,
|
||||
validateClickTarget,
|
||||
} from "./pixelCompare.js";
|
||||
export type { CropRawPatchFn, PixelCompareResult } from "./pixelCompare.js";
|
||||
|
||||
153
packages/@ant/computer-use-mcp/src/keyBlocklist.ts
Normal file
153
packages/@ant/computer-use-mcp/src/keyBlocklist.ts
Normal file
@@ -0,0 +1,153 @@
|
||||
/**
|
||||
* Key combos that cross app boundaries or terminate processes. Gated behind
|
||||
* the `systemKeyCombos` grant flag. When that flag is off, the `key` tool
|
||||
* rejects these and returns a tool error telling the model to request the
|
||||
* flag; all other combos work normally.
|
||||
*
|
||||
* Matching is canonicalized: every modifier alias the Rust executor accepts
|
||||
* collapses to one canonical name. Without this, `command+q` / `meta+q` /
|
||||
* `cmd+alt+escape` bypass the gate — see keyBlocklist.test.ts for the three
|
||||
* bypass forms and the Rust parity check that catches future alias drift.
|
||||
*/
|
||||
|
||||
/**
 * Every modifier alias enigo_wrap.rs accepts (two copies: :351-359, :564-572),
 * mapped to one canonical per Key:: variant. Left/right variants collapse —
 * the blocklist doesn't distinguish which Ctrl.
 *
 * Canonical names are Rust's own variant names lowercased. Blocklist entries
 * below use ONLY these. "meta" reads odd for Cmd+Q but it's honest: Rust
 * sends Key::Meta, which is Cmd on darwin and Win on win32.
 */
const CANONICAL_MODIFIER: Readonly<Record<string, string>> = {
  // Key::Meta — "meta"|"super"|"command"|"cmd"|"windows"|"win"
  meta: "meta",
  super: "meta",
  command: "meta",
  cmd: "meta",
  windows: "meta",
  win: "meta",
  // Key::Control + LControl + RControl
  ctrl: "ctrl",
  control: "ctrl",
  lctrl: "ctrl",
  lcontrol: "ctrl",
  rctrl: "ctrl",
  rcontrol: "ctrl",
  // Key::Shift + LShift + RShift
  shift: "shift",
  lshift: "shift",
  rshift: "shift",
  // Key::Alt and Key::Option — distinct Rust variants but same keycode on
  // darwin (kVK_Option). Collapse: cmd+alt+escape and cmd+option+escape
  // both Force Quit.
  alt: "alt",
  option: "alt",
};

/** Sort order for canonicals. ctrl < alt < shift < meta. */
const MODIFIER_ORDER = ["ctrl", "alt", "shift", "meta"];

/**
 * Canonical-form entries only. Every modifier must be a CANONICAL_MODIFIER
 * *value* (not key), modifiers must be in MODIFIER_ORDER, non-modifier last.
 * The self-consistency test enforces this.
 */
const BLOCKED_DARWIN = new Set([
  "meta+q", // Cmd+Q — quit frontmost app
  "shift+meta+q", // Cmd+Shift+Q — log out
  "alt+meta+escape", // Cmd+Option+Esc — Force Quit dialog
  "meta+tab", // Cmd+Tab — app switcher
  "meta+space", // Cmd+Space — Spotlight
  "ctrl+meta+q", // Ctrl+Cmd+Q — lock screen
]);

const BLOCKED_WIN32 = new Set([
  "ctrl+alt+delete", // Secure Attention Sequence
  "alt+f4", // close window
  "alt+tab", // window switcher
  "meta+l", // Win+L — lock
  "meta+d", // Win+D — show desktop
]);

/**
 * Partition into sorted-canonical modifiers and non-modifier keys.
 * Shared by normalizeKeySequence (join for display) and isSystemKeyCombo
 * (check mods+each-key to catch the cmd+q+a suffix bypass).
 *
 * @param seq Raw "+"-separated key sequence; case and surrounding whitespace
 *   of each part are ignored, and empty parts are dropped.
 * @returns `mods`: deduped canonical modifiers in MODIFIER_ORDER;
 *   `keys`: every other part, lowercased, in input order.
 */
function partitionKeys(seq: string): { mods: string[]; keys: string[] } {
  const parts = seq
    .toLowerCase()
    .split("+")
    .map((p) => p.trim())
    .filter(Boolean);
  const mods: string[] = [];
  const keys: string[] = [];
  for (const p of parts) {
    // Own-property lookup only. CANONICAL_MODIFIER is a plain object, so a
    // bare index with "constructor" or "__proto__" would return an inherited
    // prototype member, get treated as a modifier, and corrupt the prefix
    // that isSystemKeyCombo matches against the blocklist.
    const canonical = Object.prototype.hasOwnProperty.call(CANONICAL_MODIFIER, p)
      ? CANONICAL_MODIFIER[p]
      : undefined;
    if (canonical !== undefined) {
      mods.push(canonical);
    } else {
      keys.push(p);
    }
  }
  // Dedupe: "cmd+command+q" → "meta+q", not "meta+meta+q".
  const uniqueMods = [...new Set(mods)];
  uniqueMods.sort(
    (a, b) => MODIFIER_ORDER.indexOf(a) - MODIFIER_ORDER.indexOf(b),
  );
  return { mods: uniqueMods, keys };
}

/**
 * Normalize "Cmd + Shift + Q" → "shift+meta+q": lowercase, trim, alias →
 * canonical, dedupe, sort modifiers, non-modifiers last.
 *
 * @param seq Raw "+"-separated key sequence.
 * @returns The canonical "+"-joined form.
 */
export function normalizeKeySequence(seq: string): string {
  const { mods, keys } = partitionKeys(seq);
  return [...mods, ...keys].join("+");
}

/**
 * True if the sequence would fire a blocked OS shortcut.
 *
 * Checks mods + EACH non-modifier key individually, not just the full
 * joined string. `cmd+q+a` → Rust presses Cmd, then Q (Cmd+Q fires here),
 * then A. Exact-match against "meta+q+a" misses; checking "meta+q" and
 * "meta+a" separately catches the Q.
 *
 * Modifiers-only sequences ("cmd+shift") are checked as-is — no key to
 * pair with, and no blocklist entry is modifier-only, so this is a no-op
 * that falls through to false. Covers the click-modifier case where
 * `left_click(text="cmd")` is legitimate.
 *
 * @param seq Raw "+"-separated key sequence.
 * @param platform Selects the blocklist: "darwin" uses BLOCKED_DARWIN, any
 *   other value uses BLOCKED_WIN32.
 * @returns true when any mods+key pairing is on the platform's blocklist.
 */
export function isSystemKeyCombo(
  seq: string,
  platform: "darwin" | "win32",
): boolean {
  const blocklist = platform === "darwin" ? BLOCKED_DARWIN : BLOCKED_WIN32;
  const { mods, keys } = partitionKeys(seq);
  const prefix = mods.length > 0 ? mods.join("+") + "+" : "";

  // No non-modifier keys (e.g. "cmd+shift" as click-modifiers) — check the
  // whole thing. Never matches (no blocklist entry is modifier-only) but
  // keeps the contract simple: every call reaches a .has().
  if (keys.length === 0) {
    return blocklist.has(mods.join("+"));
  }

  // mods + each key. Any hit blocks the whole sequence.
  return keys.some((key) => blocklist.has(prefix + key));
}

export const _test = {
  CANONICAL_MODIFIER,
  BLOCKED_DARWIN,
  BLOCKED_WIN32,
  MODIFIER_ORDER,
};
|
||||
313
packages/@ant/computer-use-mcp/src/mcpServer.ts
Normal file
313
packages/@ant/computer-use-mcp/src/mcpServer.ts
Normal file
@@ -0,0 +1,313 @@
|
||||
/**
|
||||
* MCP server factory + session-context binder.
|
||||
*
|
||||
* Two entry points:
|
||||
*
|
||||
* `bindSessionContext` — the wrapper closure. Takes a `ComputerUseSessionContext`
|
||||
* (getters + callbacks backed by host session state), returns a dispatcher.
|
||||
* Reusable by both the MCP CallTool handler here AND Cowork's
|
||||
* `InternalServerDefinition.handleToolCall` (which doesn't go through MCP).
|
||||
* This replaces the duplicated wrapper closures in apps/desktop/…/serverDef.ts
|
||||
* and the Claude Code CLI's CU host wrapper — both did the same thing: build `ComputerUseOverrides`
|
||||
* fresh from getters, call `handleToolCall`, stash screenshot, merge permissions.
|
||||
*
|
||||
* `createComputerUseMcpServer` — the Server object. When `context` is provided,
|
||||
* the CallTool handler is real (uses `bindSessionContext`). When not, it's the
|
||||
* legacy stub that returns a not-wired error. The tool-schema ListTools handler
|
||||
* is the same either way.
|
||||
*/
|
||||
|
||||
import { Server } from "@modelcontextprotocol/sdk/server/index.js";
|
||||
import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js";
|
||||
import {
|
||||
CallToolRequestSchema,
|
||||
ListToolsRequestSchema,
|
||||
} from "@modelcontextprotocol/sdk/types.js";
|
||||
|
||||
import type { ScreenshotResult } from "./executor.js";
|
||||
import type { CuCallToolResult } from "./toolCalls.js";
|
||||
import {
|
||||
defersLockAcquire,
|
||||
handleToolCall,
|
||||
resetMouseButtonHeld,
|
||||
} from "./toolCalls.js";
|
||||
import { buildComputerUseTools } from "./tools.js";
|
||||
import type {
|
||||
AppGrant,
|
||||
ComputerUseHostAdapter,
|
||||
ComputerUseOverrides,
|
||||
ComputerUseSessionContext,
|
||||
CoordinateMode,
|
||||
CuGrantFlags,
|
||||
CuPermissionResponse,
|
||||
} from "./types.js";
|
||||
import { DEFAULT_GRANT_FLAGS } from "./types.js";
|
||||
|
||||
/** Fallback tool-error text used when another session holds the computer-use lock. */
const DEFAULT_LOCK_HELD_MESSAGE = [
  "Another Claude session is currently using the computer. Wait for that",
  "session to finish, or find a non-computer-use approach.",
].join(" ");
|
||||
|
||||
/**
 * Dedupe `granted` into `existing` on bundleId, spread truthy-only flags over
 * defaults+existing. Truthy-only: a subsequent `request_access` that doesn't
 * request clipboard can't revoke an earlier clipboard grant — revocation lives
 * in a Settings page, not here.
 *
 * Same merge both hosts implemented independently today.
 *
 * @param existing Apps already granted; kept first and in order.
 * @param existingFlags Flags already granted; layered over DEFAULT_GRANT_FLAGS.
 * @param response The permission response to fold in.
 * @returns `apps`: `existing` followed by each newly granted app once (first
 *   occurrence wins); `flags`: defaults, then existing, then the response
 *   flags that are exactly `true`.
 */
function mergePermissionResponse(
  existing: readonly AppGrant[],
  existingFlags: CuGrantFlags,
  response: CuPermissionResponse,
): { apps: AppGrant[]; flags: CuGrantFlags } {
  const seen = new Set(existing.map((a) => a.bundleId));
  const apps = [...existing];
  for (const grant of response.granted) {
    if (seen.has(grant.bundleId)) continue;
    // Record the id as we go so a bundleId repeated inside `response.granted`
    // is appended only once, not just ids that were already in `existing`.
    seen.add(grant.bundleId);
    apps.push(grant);
  }
  const truthyFlags = Object.fromEntries(
    Object.entries(response.flags).filter(([, v]) => v === true),
  );
  const flags: CuGrantFlags = {
    ...DEFAULT_GRANT_FLAGS,
    ...existingFlags,
    ...truthyFlags,
  };
  return { apps, flags };
}
|
||||
|
||||
/**
 * Bind session state to a reusable dispatcher.
 *
 * Each call to the returned function runs, in order: the async lock gate,
 * a fresh build of `ComputerUseOverrides` from the context getters, the
 * `handleToolCall` dispatch, and the stash of any screenshot in the result.
 *
 * The last screenshot lives in a closure variable rather than on `ctx`, so
 * hosts only need to keep the returned dispatcher alive — `ctx` object
 * identity across calls is not required.
 *
 * @param adapter Host adapter; supplies `logger` and `serverName`, and is
 *   passed through to `handleToolCall`.
 * @param coordinateMode Coordinate convention placed in the overrides.
 * @param ctx Getters and callbacks backed by host session state.
 * @returns The dispatcher `(name, args) => Promise<CuCallToolResult>`.
 */
export function bindSessionContext(
  adapter: ComputerUseHostAdapter,
  coordinateMode: CoordinateMode,
  ctx: ComputerUseSessionContext,
): (name: string, args: unknown) => Promise<CuCallToolResult> {
  const { logger, serverName } = adapter;

  // Closure cell for the most recent screenshot blob (see doc comment).
  let lastScreenshot: ScreenshotResult | undefined;

  /** Tool-error result returned when a different session holds the lock. */
  const lockHeldResult = (text: string): CuCallToolResult => ({
    content: [{ type: "text", text }],
    isError: true,
    telemetry: { error_kind: "cu_lock_held" },
  });

  // Wraps the host's permission prompt: after it answers, merge the grant
  // into the current apps/flags and notify the host of the new state.
  const wrapPermission = ctx.onPermissionRequest
    ? async (
        req: Parameters<NonNullable<typeof ctx.onPermissionRequest>>[0],
        signal: AbortSignal,
      ): Promise<CuPermissionResponse> => {
        const answer = await ctx.onPermissionRequest!(req, signal);
        const merged = mergePermissionResponse(
          ctx.getAllowedApps(),
          ctx.getGrantFlags(),
          answer,
        );
        logger.debug(
          `[${serverName}] permission result: granted=${answer.granted.length} denied=${answer.denied.length}`,
        );
        ctx.onAllowedAppsChanged?.(merged.apps, merged.flags);
        return answer;
      }
    : undefined;

  // Same idea for teach-mode prompts, except the merged flags are discarded:
  // teach doesn't request grant flags, so the existing ones are re-read and
  // passed through unchanged.
  const wrapTeachPermission = ctx.onTeachPermissionRequest
    ? async (
        req: Parameters<NonNullable<typeof ctx.onTeachPermissionRequest>>[0],
        signal: AbortSignal,
      ): Promise<CuPermissionResponse> => {
        const answer = await ctx.onTeachPermissionRequest!(req, signal);
        logger.debug(
          `[${serverName}] teach permission result: granted=${answer.granted.length} denied=${answer.denied.length}`,
        );
        const merged = mergePermissionResponse(
          ctx.getAllowedApps(),
          ctx.getGrantFlags(),
          answer,
        );
        ctx.onAllowedAppsChanged?.(merged.apps, {
          ...DEFAULT_GRANT_FLAGS,
          ...ctx.getGrantFlags(),
        });
        return answer;
      }
    : undefined;

  return async (name, args) => {
    // ─── Async lock gate ─────────────────────────────────────────────────
    // Stands in for the sync Gate-3 inside `handleToolCall`, which no-ops
    // because `checkCuLock: undefined` is passed in the overrides below.
    if (ctx.checkCuLock) {
      const lock = await ctx.checkCuLock();
      if (lock.holder !== undefined && !lock.isSelf) {
        return lockHeldResult(
          ctx.formatLockHeldMessage?.(lock.holder) ?? DEFAULT_LOCK_HELD_MESSAGE,
        );
      }
      if (lock.holder === undefined && !defersLockAcquire(name)) {
        await ctx.acquireCuLock?.();
        // The awaits above yield, so another session's check+acquire may
        // have interleaved with ours. An acquire that silently no-ops when
        // the lock is already held gives no signal that we lost, so confirm
        // we are the holder before going further.
        const recheck = await ctx.checkCuLock();
        if (recheck.holder !== undefined && !recheck.isSelf) {
          return lockHeldResult(
            ctx.formatLockHeldMessage?.(recheck.holder) ??
              DEFAULT_LOCK_HELD_MESSAGE,
          );
        }
        // We are a fresh holder: clear mouse-button state left by a previous
        // session. Done after the re-check so it only happens when we won.
        resetMouseButtonHeld();
      }
    }

    // ─── Build overrides fresh ───────────────────────────────────────────
    // Prefer the stashed blob. Without one, fall back to the host's stored
    // dimensions paired with an empty base64 string.
    const dimsFallback =
      lastScreenshot === undefined ? ctx.getLastScreenshotDims?.() : undefined;

    // One AbortController per call, aborted in `finally`, so a permission
    // dialog still open when the call ends is told to tear down.
    const dialogAbort = new AbortController();

    const overrides: ComputerUseOverrides = {
      allowedApps: [...ctx.getAllowedApps()],
      grantFlags: ctx.getGrantFlags(),
      userDeniedBundleIds: ctx.getUserDeniedBundleIds(),
      coordinateMode,
      selectedDisplayId: ctx.getSelectedDisplayId(),
      displayPinnedByModel: ctx.getDisplayPinnedByModel?.(),
      displayResolvedForApps: ctx.getDisplayResolvedForApps?.(),
      lastScreenshot:
        lastScreenshot ??
        (dimsFallback ? { ...dimsFallback, base64: "" } : undefined),
      onPermissionRequest: wrapPermission
        ? (req) => wrapPermission(req, dialogAbort.signal)
        : undefined,
      onTeachPermissionRequest: wrapTeachPermission
        ? (req) => wrapTeachPermission(req, dialogAbort.signal)
        : undefined,
      onAppsHidden: ctx.onAppsHidden,
      getClipboardStash: ctx.getClipboardStash,
      onClipboardStashChanged: ctx.onClipboardStashChanged,
      onResolvedDisplayUpdated: ctx.onResolvedDisplayUpdated,
      onDisplayPinned: ctx.onDisplayPinned,
      onDisplayResolvedForApps: ctx.onDisplayResolvedForApps,
      onTeachModeActivated: ctx.onTeachModeActivated,
      onTeachStep: ctx.onTeachStep,
      onTeachWorking: ctx.onTeachWorking,
      getTeachModeActive: ctx.getTeachModeActive,
      // Left undefined on purpose: the async gate above already ran.
      checkCuLock: undefined,
      acquireCuLock: undefined,
      isAborted: ctx.isAborted,
    };

    logger.debug(
      `[${serverName}] tool=${name} allowedApps=${overrides.allowedApps.length} coordMode=${coordinateMode}`,
    );

    // ─── Dispatch ────────────────────────────────────────────────────────
    try {
      const result = await handleToolCall(adapter, name, args, overrides);

      if (result.screenshot) {
        // Keep the blob for the next call; report only its dimensions.
        lastScreenshot = result.screenshot;
        const { base64: _blob, ...dims } = result.screenshot;
        logger.debug(`[${serverName}] screenshot dims: ${JSON.stringify(dims)}`);
        ctx.onScreenshotCaptured?.(dims);
      }

      return result;
    } finally {
      dialogAbort.abort();
    }
  };
}
|
||||
|
||||
/**
 * Create the MCP `Server` for computer use.
 *
 * The ListTools handler is the same in both modes: it returns the tool list
 * built once at creation time, or an empty list while the adapter reports
 * itself disabled. The CallTool handler depends on `context`:
 *  - with `context`: calls are routed through a `bindSessionContext`
 *    dispatcher, and the `screenshot` / `telemetry` fields are stripped from
 *    the result before it is returned over MCP;
 *  - without `context`: a stub that logs a warning and returns an error
 *    result saying the server is not wired to a session.
 *
 * @param adapter Host adapter (server name, logger, executor capabilities).
 * @param coordinateMode Coordinate convention used for the tool schemas and
 *   the dispatcher.
 * @param context Optional session context; enables the real CallTool handler.
 * @returns The configured server.
 */
export function createComputerUseMcpServer(
  adapter: ComputerUseHostAdapter,
  coordinateMode: CoordinateMode,
  context?: ComputerUseSessionContext,
): Server {
  const { serverName, logger } = adapter;

  const server = new Server(
    { name: serverName, version: "0.1.3" },
    { capabilities: { tools: {}, logging: {} } },
  );

  const tools = buildComputerUseTools(
    adapter.executor.capabilities,
    coordinateMode,
  );

  server.setRequestHandler(ListToolsRequestSchema, async () => ({
    tools: adapter.isDisabled() ? [] : tools,
  }));

  if (!context) {
    // Legacy path: nothing bound. Only reachable when a caller talks to the
    // server over MCP transport without going through a binder, so return a
    // clear error rather than failing silently.
    server.setRequestHandler(
      CallToolRequestSchema,
      async (request): Promise<CallToolResult> => {
        logger.warn(
          `[${serverName}] tool call "${request.params.name}" reached the stub handler — no session context bound. Per-session state unavailable.`,
        );
        return {
          content: [
            {
              type: "text",
              text: "This computer-use server instance is not wired to a session. Per-session app permissions are not available on this code path.",
            },
          ],
          isError: true,
        };
      },
    );
    return server;
  }

  const dispatch = bindSessionContext(adapter, coordinateMode, context);
  server.setRequestHandler(
    CallToolRequestSchema,
    async (request): Promise<CallToolResult> => {
      const outcome = await dispatch(
        request.params.name,
        request.params.arguments ?? {},
      );
      // Drop the piggybacked fields; everything else is the MCP result.
      const { screenshot: _s, telemetry: _t, ...result } = outcome;
      return result;
    },
  );
  return server;
}
|
||||
171
packages/@ant/computer-use-mcp/src/pixelCompare.ts
Normal file
171
packages/@ant/computer-use-mcp/src/pixelCompare.ts
Normal file
@@ -0,0 +1,171 @@
|
||||
/**
|
||||
* Staleness guard ported from the Vercept acquisition.
|
||||
*
|
||||
* Compares the model's last-seen screenshot against a fresh-right-now
|
||||
* screenshot at the click target, so the model never clicks pixels it hasn't
|
||||
* seen. If the 9×9 patch around the target differs, the click is aborted and
|
||||
* the model is told to re-screenshot. This is NOT a popup detector.
|
||||
*
|
||||
* Semantics preserved exactly:
|
||||
* - Skip on no `lastScreenshot` (cold start) — click proceeds.
|
||||
* - Skip on any internal error (crop throws, screenshot fails, etc.) —
|
||||
* click proceeds. Validation failure must never block the action.
|
||||
* - 9×9 exact byte equality on raw pixel bytes. No fuzzing, no tolerance.
|
||||
* - Compare in percentage coords so Retina scale doesn't matter.
|
||||
*
|
||||
* JPEG decode + crop is INJECTED via `ComputerUseHostAdapter.cropRawPatch`.
|
||||
* The original used `sharp` (LGPL, native `.node` addon); we inject Electron's
|
||||
* `nativeImage` (Chromium decoders, BSD, nothing to bundle) from the host, so
|
||||
* this package never imports it — the crop is a function parameter.
|
||||
*/
|
||||
|
||||
import type { ScreenshotResult } from "./executor.js";
|
||||
import type { Logger } from "./types.js";
|
||||
|
||||
/** Injected by the host. See `ComputerUseHostAdapter.cropRawPatch`. */
export type CropRawPatchFn = (
  jpegBase64: string,
  rect: { x: number; y: number; width: number; height: number },
) => Buffer | null;

/** 9×9 is empirically the sweet spot — large enough to catch a tooltip
 * appearing, small enough to not false-positive on surrounding animation.
 **/
const DEFAULT_GRID_SIZE = 9;

export interface PixelCompareResult {
  /** true → click may proceed. false → patch changed, abort the click. */
  valid: boolean;
  /** true → validation did not run (cold start, sub-gate off, or internal
   * error). The caller MUST treat this identically to `valid: true`. */
  skipped: boolean;
  /** Populated when valid === false. Returned to the model verbatim. */
  warning?: string;
}

/**
 * Compute the crop rect for a patch centered on (xPercent, yPercent).
 *
 * Percentages are clamped to 0–100 and the rect is clipped to the image, so
 * a patch near an edge is smaller than gridSize × gridSize.
 *
 * @returns The rect, or null when no usable rect exists: zero/missing image
 *   dimensions, a NaN percentage or grid size, or an empty clipped rect.
 */
function computeCropRect(
  imgW: number,
  imgH: number,
  xPercent: number,
  yPercent: number,
  gridSize: number,
): { x: number; y: number; width: number; height: number } | null {
  if (!imgW || !imgH) return null;
  // NaN passes through Math.min/Math.max and through the `<= 0` check below,
  // which would hand the crop function a rect full of NaN.
  if (
    Number.isNaN(xPercent) ||
    Number.isNaN(yPercent) ||
    Number.isNaN(gridSize)
  ) {
    return null;
  }

  const clampedX = Math.max(0, Math.min(100, xPercent));
  const clampedY = Math.max(0, Math.min(100, yPercent));

  const centerX = Math.round((clampedX / 100.0) * imgW);
  const centerY = Math.round((clampedY / 100.0) * imgH);

  const halfGrid = Math.floor(gridSize / 2);
  const cropX = Math.max(0, centerX - halfGrid);
  const cropY = Math.max(0, centerY - halfGrid);
  const cropW = Math.min(gridSize, imgW - cropX);
  const cropH = Math.min(gridSize, imgH - cropY);
  if (cropW <= 0 || cropH <= 0) return null;

  return { x: cropX, y: cropY, width: cropW, height: cropH };
}

/**
 * Crop the same patch from both screenshots and compare the raw bytes.
 *
 * Tri-state so callers can tell "differs" apart from "could not compare".
 *
 * @returns true when the bytes are identical, false when they differ, and
 *   null when no comparison was possible (no usable rect, or the injected
 *   crop returned null for either screenshot).
 */
function comparePatches(
  crop: CropRawPatchFn,
  lastScreenshot: ScreenshotResult,
  freshScreenshot: ScreenshotResult,
  xPercent: number,
  yPercent: number,
  gridSize: number,
): boolean | null {
  // The rect is derived from the fresh screenshot's dimensions and applied
  // to both images.
  const rect = computeCropRect(
    freshScreenshot.width,
    freshScreenshot.height,
    xPercent,
    yPercent,
    gridSize,
  );
  if (!rect) return null;

  const patch1 = crop(lastScreenshot.base64, rect);
  const patch2 = crop(freshScreenshot.base64, rect);
  if (!patch1 || !patch2) return null;

  // Plain buffer equality; both patches come from the same crop function, so
  // the pixel format is whatever it produces, identically for both.
  return patch1.equals(patch2);
}

/**
 * Compare the same patch location between two screenshots.
 *
 * @returns true when the raw pixel bytes are identical. false on any
 * difference, and also when no comparison could be made (no usable rect, or
 * the crop function returned null). Callers that must distinguish those two
 * cases should use `validateClickTarget`, which reports the latter as skipped.
 */
export function comparePixelAtLocation(
  crop: CropRawPatchFn,
  lastScreenshot: ScreenshotResult,
  freshScreenshot: ScreenshotResult,
  xPercent: number,
  yPercent: number,
  gridSize: number = DEFAULT_GRID_SIZE,
): boolean {
  return (
    comparePatches(
      crop,
      lastScreenshot,
      freshScreenshot,
      xPercent,
      yPercent,
      gridSize,
    ) === true
  );
}

/**
 * Click-target validation, with the fresh-screenshot capture delegated to
 * the caller.
 *
 * Skip conditions (any of these → `{ valid: true, skipped: true }`):
 *   - `lastScreenshot` is undefined (cold start).
 *   - `takeFreshScreenshot()` throws or returns null.
 *   - No usable crop rect, or the injected crop function returns null
 *     (decode failure).
 *   - Any other exception.
 *
 * The caller decides whether to invoke this at all.
 *
 * @param crop Host-injected decode+crop function.
 * @param lastScreenshot The screenshot the model last saw, if any.
 * @param xPercent Target x as a percentage (0–100) of image width.
 * @param yPercent Target y as a percentage (0–100) of image height.
 * @param takeFreshScreenshot Captures the current screen; may return null.
 * @param logger Receives a debug line whenever validation is skipped because
 *   of an error or an unavailable patch.
 * @param gridSize Side length of the compared patch, in pixels.
 * @returns `valid: false` with a warning only when both patches were
 *   obtained and their bytes differ.
 */
export async function validateClickTarget(
  crop: CropRawPatchFn,
  lastScreenshot: ScreenshotResult | undefined,
  xPercent: number,
  yPercent: number,
  takeFreshScreenshot: () => Promise<ScreenshotResult | null>,
  logger: Logger,
  gridSize: number = DEFAULT_GRID_SIZE,
): Promise<PixelCompareResult> {
  if (!lastScreenshot) {
    return { valid: true, skipped: true };
  }

  try {
    const fresh = await takeFreshScreenshot();
    if (!fresh) {
      return { valid: true, skipped: true };
    }

    const pixelsMatch = comparePatches(
      crop,
      lastScreenshot,
      fresh,
      xPercent,
      yPercent,
      gridSize,
    );

    if (pixelsMatch === null) {
      // Could not compare (decode failure, empty blob, no rect). That is a
      // validation failure, not evidence the screen changed: skip instead of
      // blocking the click.
      logger.debug("[pixelCompare] patch unavailable, skipping");
      return { valid: true, skipped: true };
    }
    if (pixelsMatch) {
      return { valid: true, skipped: false };
    }
    return {
      valid: false,
      skipped: false,
      warning:
        "Screen content at the target location changed since the last screenshot. Take a new screenshot before clicking.",
    };
  } catch (err) {
    // Skip validation on technical errors, execute action anyway:
    // validation failure must never block the click.
    logger.debug("[pixelCompare] validation error, skipping", err);
    return { valid: true, skipped: true };
  }
}
|
||||
@@ -1,32 +1,43 @@
|
||||
/**
|
||||
* Sentinel apps — 需要特殊权限警告的应用列表
|
||||
* Bundle IDs that are escalations-in-disguise. The approval UI shows a warning
|
||||
* badge for these; they are NOT blocked. Power users may legitimately want the
|
||||
* model controlling a terminal.
|
||||
*
|
||||
* 包含终端、文件管理器、系统设置等敏感应用。
|
||||
* Computer Use 操作这些应用时会显示额外警告。
|
||||
* Imported by the renderer via the `./sentinelApps` subpath (package.json
|
||||
* `exports`), which keeps Next.js from reaching index.ts → mcpServer.ts →
|
||||
* @modelcontextprotocol/sdk (devDep, would fail module resolution). Keep
|
||||
* this file import-free so the subpath stays clean.
|
||||
*/
|
||||
|
||||
type SentinelCategory = 'shell' | 'filesystem' | 'system_settings'
|
||||
/** These apps can execute arbitrary shell commands. */
|
||||
const SHELL_ACCESS_BUNDLE_IDS = new Set([
|
||||
"com.apple.Terminal",
|
||||
"com.googlecode.iterm2",
|
||||
"com.microsoft.VSCode",
|
||||
"dev.warp.Warp-Stable",
|
||||
"com.github.wez.wezterm",
|
||||
"io.alacritty",
|
||||
"net.kovidgoyal.kitty",
|
||||
"com.jetbrains.intellij",
|
||||
"com.jetbrains.pycharm",
|
||||
]);
|
||||
|
||||
const SENTINEL_MAP: Record<string, SentinelCategory> = {
|
||||
// Shell / Terminal
|
||||
'com.apple.Terminal': 'shell',
|
||||
'com.googlecode.iterm2': 'shell',
|
||||
'dev.warp.Warp-Stable': 'shell',
|
||||
'io.alacritty': 'shell',
|
||||
'com.github.wez.wezterm': 'shell',
|
||||
'net.kovidgoyal.kitty': 'shell',
|
||||
'co.zeit.hyper': 'shell',
|
||||
/** Finder in the allowlist ≈ browse + open-any-file. */
|
||||
const FILESYSTEM_ACCESS_BUNDLE_IDS = new Set(["com.apple.finder"]);
|
||||
|
||||
// Filesystem
|
||||
'com.apple.finder': 'filesystem',
|
||||
const SYSTEM_SETTINGS_BUNDLE_IDS = new Set(["com.apple.systempreferences"]);
|
||||
|
||||
// System Settings
|
||||
'com.apple.systempreferences': 'system_settings',
|
||||
'com.apple.SystemPreferences': 'system_settings',
|
||||
}
|
||||
export const SENTINEL_BUNDLE_IDS: ReadonlySet<string> = new Set([
|
||||
...SHELL_ACCESS_BUNDLE_IDS,
|
||||
...FILESYSTEM_ACCESS_BUNDLE_IDS,
|
||||
...SYSTEM_SETTINGS_BUNDLE_IDS,
|
||||
]);
|
||||
|
||||
export const sentinelApps: string[] = Object.keys(SENTINEL_MAP)
|
||||
export type SentinelCategory = "shell" | "filesystem" | "system_settings";
|
||||
|
||||
export function getSentinelCategory(bundleId: string): SentinelCategory | null {
|
||||
return SENTINEL_MAP[bundleId] ?? null
|
||||
if (SHELL_ACCESS_BUNDLE_IDS.has(bundleId)) return "shell";
|
||||
if (FILESYSTEM_ACCESS_BUNDLE_IDS.has(bundleId)) return "filesystem";
|
||||
if (SYSTEM_SETTINGS_BUNDLE_IDS.has(bundleId)) return "system_settings";
|
||||
return null;
|
||||
}
|
||||
|
||||
19
packages/@ant/computer-use-mcp/src/subGates.ts
Normal file
19
packages/@ant/computer-use-mcp/src/subGates.ts
Normal file
@@ -0,0 +1,19 @@
|
||||
import type { CuSubGates } from './types.js'
|
||||
|
||||
/** Sub-gate preset with every computer-use sub-gate enabled. */
export const ALL_SUB_GATES_ON: CuSubGates = {
  pixelValidation: true,
  clipboardPasteMultiline: true,
  mouseAnimation: true,
  hideBeforeAction: true,
  autoTargetDisplay: true,
  clipboardGuard: true,
}
|
||||
|
||||
/** Sub-gate preset with every computer-use sub-gate disabled. */
export const ALL_SUB_GATES_OFF: CuSubGates = {
  pixelValidation: false,
  clipboardPasteMultiline: false,
  mouseAnimation: false,
  hideBeforeAction: false,
  autoTargetDisplay: false,
  clipboardGuard: false,
}
|
||||
3649
packages/@ant/computer-use-mcp/src/toolCalls.ts
Normal file
3649
packages/@ant/computer-use-mcp/src/toolCalls.ts
Normal file
File diff suppressed because it is too large
Load Diff
706
packages/@ant/computer-use-mcp/src/tools.ts
Normal file
706
packages/@ant/computer-use-mcp/src/tools.ts
Normal file
@@ -0,0 +1,706 @@
|
||||
/**
|
||||
* MCP tool schemas for the computer-use server. Mirrors
|
||||
* claude-for-chrome-mcp/src/browserTools.ts in shape (plain `Tool`-shaped
|
||||
* object literals, no zod).
|
||||
*
|
||||
* Coordinate descriptions are baked in at tool-list build time from the
|
||||
* `chicago_coordinate_mode` gate. The model sees exactly ONE coordinate
|
||||
* convention in the param descriptions and never learns the other exists.
|
||||
* The host (`serverDef.ts`) reads the same frozen gate value for
|
||||
* `scaleCoord` — both must agree or clicks land in the wrong space.
|
||||
*/
|
||||
|
||||
import type { Tool } from "@modelcontextprotocol/sdk/types.js";
|
||||
|
||||
import type { CoordinateMode } from "./types.js";
|
||||
|
||||
// See packages/desktop/computer-use-mcp/COORDINATES.md before touching any
// model-facing coordinate text. Chrome's browserTools.ts:143 is the reference
// phrasing — "pixels from the left edge", no geometry, no number to do math with.

/** Model-facing descriptions of the x and y parameters, keyed by coordinate mode. */
const COORD_DESC: Record<CoordinateMode, { x: string; y: string }> = {
  pixels: {
    x: "Horizontal pixel position read directly from the most recent screenshot image, measured from the left edge. The server handles all scaling.",
    y: "Vertical pixel position read directly from the most recent screenshot image, measured from the top edge. The server handles all scaling.",
  },
  normalized_0_100: {
    x: "Horizontal position as a percentage of screen width, 0.0–100.0 (0 = left edge, 100 = right edge).",
    y: "Vertical position as a percentage of screen height, 0.0–100.0 (0 = top edge, 100 = bottom edge).",
  },
};
|
||||
|
||||
/**
 * Sentence spliced into the description of every input tool (clicks, typing,
 * keys, scroll, drag, mouse button, batch) so the model is told that the call
 * fails unless the frontmost application is in the session allowlist.
 */
const FRONTMOST_GATE_DESC =
  "The frontmost application must be in the session allowlist at the time of this call, or this tool returns an error and does nothing.";
|
||||
|
||||
/**
 * Item schema for the `actions` array in `computer_batch`, `teach_step`, and
 * `teach_batch`. All three dispatch through the same `dispatchAction` path
 * with the same validation — keep this enum in sync with `BATCHABLE_ACTIONS`
 * in toolCalls.ts.
 *
 * Only `action` is required; the remaining properties are optional and their
 * descriptions say which actions each one applies to.
 */
const BATCH_ACTION_ITEM_SCHEMA = {
  type: "object",
  properties: {
    action: {
      type: "string",
      enum: [
        "key",
        "type",
        "mouse_move",
        "left_click",
        "left_click_drag",
        "right_click",
        "middle_click",
        "double_click",
        "triple_click",
        "scroll",
        "hold_key",
        "screenshot",
        "cursor_position",
        "left_mouse_down",
        "left_mouse_up",
        "wait",
      ],
      description: "The action to perform.",
    },
    // Two-element numeric tuple, same shape as the standalone click tools.
    coordinate: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        "(x, y) for click/mouse_move/scroll/left_click_drag end point.",
    },
    start_coordinate: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        "(x, y) drag start — left_click_drag only. Omit to drag from current cursor.",
    },
    // One field, three meanings depending on `action` (see description).
    text: {
      type: "string",
      description:
        "For type: the text. For key/hold_key: the chord string. For click/scroll: modifier keys to hold.",
    },
    scroll_direction: {
      type: "string",
      enum: ["up", "down", "left", "right"],
    },
    scroll_amount: { type: "integer", minimum: 0, maximum: 100 },
    duration: {
      type: "number",
      description: "Seconds (0–100). For hold_key/wait.",
    },
    repeat: {
      type: "integer",
      minimum: 1,
      maximum: 100,
      description: "For key: repeat count.",
    },
  },
  required: ["action"],
};
|
||||
|
||||
/**
 * Build the tool list. Parameterized by capabilities and coordinate mode so
 * descriptions are honest and unambiguous (plan §1 — "Unfiltered + honest").
 *
 * `coordinateMode` MUST match what the host passes to `scaleCoord` at
 * tool-call time. Both should read the same frozen-at-load gate constant.
 *
 * `installedAppNames` — optional pre-sanitized list of app display names to
 * enumerate in the `request_access` description. The caller is responsible
 * for sanitization (length cap, character allowlist, sort, count cap) —
 * this function just splices the list into the description verbatim. Omit
 * to fall back to the generic "display names or bundle IDs" wording.
 *
 * @param caps - Host capabilities. `screenshotFiltering` selects the wording
 *   of the `screenshot` description; `teachMode` appends the teach-mode
 *   tools; `platform` is accepted but not read in this function body.
 * @param coordinateMode - Key into `COORD_DESC`; picks the coordinate
 *   phrasing used in every coordinate parameter description.
 * @param installedAppNames - Optional app names appended to the `apps`
 *   parameter description of `request_access` (and `request_teach_access`).
 * @returns The tool definitions in a fixed order, with teach tools last when
 *   `caps.teachMode` is truthy.
 */
export function buildComputerUseTools(
  caps: {
    screenshotFiltering: "native" | "none";
    platform: "darwin" | "win32";
    /** Include request_teach_access + teach_step. Read once at server construction. */
    teachMode?: boolean;
  },
  coordinateMode: CoordinateMode,
  installedAppNames?: string[],
): Tool[] {
  const coord = COORD_DESC[coordinateMode];

  // Shared hint suffix for BOTH request_access and request_teach_access —
  // they use the same resolveRequestedApps path, so the model should get
  // the same enumeration for both.
  const installedAppsHint =
    installedAppNames && installedAppNames.length > 0
      ? ` Available applications on this machine: ${installedAppNames.join(", ")}.`
      : "";

  // `[x, y]` tuple — param shape for all click/move/scroll tools.
  const coordinateTuple = {
    type: "array",
    items: { type: "number" },
    minItems: 2,
    maxItems: 2,
    description: `(x, y): ${coord.x}`,
  };
  // Modifier hold during click. Shared across all 5 click variants.
  const clickModifierText = {
    type: "string",
    description:
      'Modifier keys to hold during the click (e.g. "shift", "ctrl+shift"). Supports the same syntax as the key tool.',
  };

  // Tell the model the truth about whether non-allowlisted apps can appear
  // in screenshots on this host.
  const screenshotDesc =
    caps.screenshotFiltering === "native"
      ? "Take a screenshot of the primary display. Applications not in the session allowlist are excluded at the compositor level — only granted apps and the desktop are visible."
      : "Take a screenshot of the primary display. On this platform, screenshots are NOT filtered — all open windows are visible. Input actions targeting apps not in the session allowlist are rejected.";

  return [
    {
      name: "request_access",
      description:
        "Request user permission to control a set of applications for this session. Must be called before any other tool in this server. " +
        "The user sees a single dialog listing all requested apps and either allows the whole set or denies it. " +
        "Call this again mid-session to add more apps; previously granted apps remain granted. " +
        "Returns the granted apps, denied apps, and screenshot filtering capability.",
      inputSchema: {
        type: "object" as const,
        properties: {
          apps: {
            type: "array",
            items: { type: "string" },
            description:
              "Application display names (e.g. \"Slack\", \"Calendar\") or bundle identifiers (e.g. \"com.tinyspeck.slackmacgap\"). Display names are resolved case-insensitively against installed apps." +
              installedAppsHint,
          },
          reason: {
            type: "string",
            description:
              "One-sentence explanation shown to the user in the approval dialog. Explain the task, not the mechanism.",
          },
          clipboardRead: {
            type: "boolean",
            description:
              "Also request permission to read the user's clipboard (separate checkbox in the dialog).",
          },
          clipboardWrite: {
            type: "boolean",
            description:
              "Also request permission to write the user's clipboard. When granted, multi-line `type` calls use the clipboard fast path.",
          },
          systemKeyCombos: {
            type: "boolean",
            description:
              "Also request permission to send system-level key combos (quit app, switch app, lock screen). Without this, those specific combos are blocked.",
          },
        },
        required: ["apps", "reason"],
      },
    },

    {
      name: "screenshot",
      description:
        screenshotDesc +
        " Returns an error if the allowlist is empty. The returned image is what subsequent click coordinates are relative to.",
      inputSchema: {
        type: "object" as const,
        properties: {
          save_to_disk: {
            type: "boolean",
            description:
              "Save the image to disk so it can be attached to a message for the user. Returns the saved path in the tool result. Only set this when you intend to share the image — screenshots you're just looking at don't need saving.",
          },
        },
        required: [],
      },
    },

    {
      name: "zoom",
      description:
        "Take a higher-resolution screenshot of a specific region of the last full-screen screenshot. Use this liberally to inspect small text, button labels, or fine UI details that are hard to read in the downsampled full-screen image. " +
        "IMPORTANT: Coordinates in subsequent click calls always refer to the full-screen screenshot, never the zoomed image. This tool is read-only for inspecting detail.",
      inputSchema: {
        type: "object" as const,
        properties: {
          region: {
            type: "array",
            items: { type: "integer" },
            minItems: 4,
            maxItems: 4,
            description:
              "(x0, y0, x1, y1): Rectangle to zoom into, in the coordinate space of the most recent full-screen screenshot. x0,y0 = top-left, x1,y1 = bottom-right.",
          },
          save_to_disk: {
            type: "boolean",
            description:
              "Save the image to disk so it can be attached to a message for the user. Returns the saved path in the tool result. Only set this when you intend to share the image.",
          },
        },
        required: ["region"],
      },
    },

    {
      name: "left_click",
      description: `Left-click at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },

    {
      name: "double_click",
      description: `Double-click at the given coordinates. Selects a word in most text editors. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },

    {
      name: "triple_click",
      description: `Triple-click at the given coordinates. Selects a line in most text editors. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },

    {
      name: "right_click",
      description: `Right-click at the given coordinates. Opens a context menu in most applications. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },

    {
      name: "middle_click",
      description: `Middle-click (scroll-wheel click) at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          text: clickModifierText,
        },
        required: ["coordinate"],
      },
    },

    {
      name: "type",
      description: `Type text into whatever currently has keyboard focus. ${FRONTMOST_GATE_DESC} Newlines are supported. For keyboard shortcuts use \`key\` instead.`,
      inputSchema: {
        type: "object" as const,
        properties: {
          text: { type: "string", description: "Text to type." },
        },
        required: ["text"],
      },
    },

    {
      name: "key",
      description:
        `Press a key or key combination (e.g. "return", "escape", "cmd+a", "ctrl+shift+tab"). ${FRONTMOST_GATE_DESC} ` +
        "System-level combos (quit app, switch app, lock screen) require the `systemKeyCombos` grant — without it they return an error. All other combos work.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: {
            type: "string",
            description: 'Modifiers joined with "+", e.g. "cmd+shift+a".',
          },
          repeat: {
            type: "integer",
            minimum: 1,
            maximum: 100,
            description: "Number of times to repeat the key press. Default is 1.",
          },
        },
        required: ["text"],
      },
    },

    {
      name: "scroll",
      description: `Scroll at the given coordinates. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
          scroll_direction: {
            type: "string",
            enum: ["up", "down", "left", "right"],
            description: "Direction to scroll.",
          },
          scroll_amount: {
            type: "integer",
            minimum: 0,
            maximum: 100,
            description: "Number of scroll ticks.",
          },
        },
        required: ["coordinate", "scroll_direction", "scroll_amount"],
      },
    },

    {
      name: "left_click_drag",
      description: `Press, move to target, and release. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          // Same tuple shape as the click tools; only the description differs.
          coordinate: {
            ...coordinateTuple,
            description: `(x, y) end point: ${coord.x}`,
          },
          start_coordinate: {
            ...coordinateTuple,
            description: `(x, y) start point. If omitted, drags from the current cursor position. ${coord.x}`,
          },
        },
        required: ["coordinate"],
      },
    },

    {
      name: "mouse_move",
      description: `Move the mouse cursor without clicking. Useful for triggering hover states. ${FRONTMOST_GATE_DESC}`,
      inputSchema: {
        type: "object" as const,
        properties: {
          coordinate: coordinateTuple,
        },
        required: ["coordinate"],
      },
    },

    {
      name: "open_application",
      description:
        "Bring an application to the front, launching it if necessary. The target application must already be in the session allowlist — call request_access first.",
      inputSchema: {
        type: "object" as const,
        properties: {
          app: {
            type: "string",
            description:
              "Display name (e.g. \"Slack\") or bundle identifier (e.g. \"com.tinyspeck.slackmacgap\").",
          },
        },
        required: ["app"],
      },
    },

    {
      name: "switch_display",
      description:
        "Switch which monitor subsequent screenshots capture. Use this when the " +
        "application you need is on a different monitor than the one shown. " +
        "The screenshot tool tells you which monitor it captured and lists " +
        "other attached monitors by name — pass one of those names here. " +
        "After switching, call screenshot to see the new monitor. " +
        'Pass "auto" to return to automatic monitor selection.',
      inputSchema: {
        type: "object" as const,
        properties: {
          display: {
            type: "string",
            description:
              'Monitor name from the screenshot note (e.g. "Built-in Retina Display", ' +
              '"LG UltraFine"), or "auto" to re-enable automatic selection.',
          },
        },
        required: ["display"],
      },
    },

    {
      name: "list_granted_applications",
      description:
        "List the applications currently in the session allowlist, plus the active grant flags and coordinate mode. No side effects.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },

    {
      name: "read_clipboard",
      description:
        "Read the current clipboard contents as text. Requires the `clipboardRead` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },

    {
      name: "write_clipboard",
      description:
        "Write text to the clipboard. Requires the `clipboardWrite` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: { type: "string" },
        },
        required: ["text"],
      },
    },

    {
      name: "wait",
      description: "Wait for a specified duration.",
      inputSchema: {
        type: "object" as const,
        properties: {
          duration: {
            type: "number",
            description: "Duration in seconds (0–100).",
          },
        },
        required: ["duration"],
      },
    },

    {
      name: "cursor_position",
      description:
        "Get the current mouse cursor position. Returns image-pixel coordinates relative to the most recent screenshot, or logical points if no screenshot has been taken.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },

    {
      name: "hold_key",
      description:
        `Press and hold a key or key combination for the specified duration, then release. ${FRONTMOST_GATE_DESC} ` +
        "System-level combos require the `systemKeyCombos` grant.",
      inputSchema: {
        type: "object" as const,
        properties: {
          text: {
            type: "string",
            description: 'Key or chord to hold, e.g. "space", "shift+down".',
          },
          duration: {
            type: "number",
            description: "Duration in seconds (0–100).",
          },
        },
        required: ["text", "duration"],
      },
    },

    {
      name: "left_mouse_down",
      description:
        `Press the left mouse button at the current cursor position and leave it held. ${FRONTMOST_GATE_DESC} ` +
        "Use mouse_move first to position the cursor. Call left_mouse_up to release. Errors if the button is already held.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },

    {
      name: "left_mouse_up",
      description:
        `Release the left mouse button at the current cursor position. ${FRONTMOST_GATE_DESC} ` +
        "Pairs with left_mouse_down. Safe to call even if the button is not currently held.",
      inputSchema: {
        type: "object" as const,
        properties: {},
        required: [],
      },
    },

    {
      name: "computer_batch",
      description:
        "Execute a sequence of actions in ONE tool call. Each individual tool call requires a model→API round trip (seconds); " +
        "batching a predictable sequence eliminates all but one. Use this whenever you can predict the outcome of several actions ahead — " +
        "e.g. click a field, type into it, press Return. Actions execute sequentially and stop on the first error. " +
        `${FRONTMOST_GATE_DESC} The frontmost check runs before EACH action inside the batch — if an action opens a non-allowed app, the next action's gate fires and the batch stops there. ` +
        "Mid-batch screenshot actions are allowed for inspection but coordinates in subsequent clicks always refer to the PRE-BATCH full-screen screenshot.",
      inputSchema: {
        type: "object" as const,
        properties: {
          actions: {
            type: "array",
            minItems: 1,
            items: BATCH_ACTION_ITEM_SCHEMA,
            description:
              'List of actions. Example: [{"action":"left_click","coordinate":[100,200]},{"action":"type","text":"hello"},{"action":"key","text":"Return"}]',
          },
        },
        required: ["actions"],
      },
    },

    // Teach-mode tools are appended only when the host enabled them.
    ...(caps.teachMode ? buildTeachTools(coord, installedAppsHint) : []),
  ];
}
|
||||
|
||||
/**
 * Teach-mode tools. Split out so the spread in `buildComputerUseTools` stays
 * a single expression; takes `coord` so `teach_step.anchor`'s description
 * uses the same frozen coordinate-mode phrasing as click coords, and
 * `installedAppsHint` so `request_teach_access.apps` gets the same
 * enumeration as `request_access.apps` (same resolution path → same hint).
 *
 * @param coord - Coordinate phrasing for the active mode; only `coord.x` is
 *   spliced into the `anchor` description.
 * @param installedAppsHint - Suffix appended verbatim to the `apps`
 *   description of `request_teach_access`; may be the empty string.
 * @returns `request_teach_access`, `teach_step`, and `teach_batch`, in that
 *   order.
 */
function buildTeachTools(
  coord: { x: string; y: string },
  installedAppsHint: string,
): Tool[] {
  // Shared between teach_step (top-level) and teach_batch (inside steps[]
  // items). Depends on coord, so it lives inside this factory.
  const teachStepProperties = {
    explanation: {
      type: "string",
      description:
        "Tooltip body text. Explain what the user is looking at and why it matters. " +
        "This is the ONLY place the user sees your words — be complete but concise.",
    },
    next_preview: {
      type: "string",
      description:
        "One line describing exactly what will happen when the user clicks Next. " +
        'Example: "Next: I\'ll click Create Bucket and type the name." ' +
        "Shown below the explanation in a smaller font.",
    },
    anchor: {
      type: "array",
      items: { type: "number" },
      minItems: 2,
      maxItems: 2,
      description:
        `(x, y) — where the tooltip arrow points. ${coord.x} ` +
        "Omit to center the tooltip with no arrow (for general-context steps).",
    },
    actions: {
      type: "array",
      // Empty allowed — "read this, click Next" steps.
      items: BATCH_ACTION_ITEM_SCHEMA,
      description:
        "Actions to execute when the user clicks Next. Same item schema as computer_batch.actions. " +
        "Empty array is valid for purely explanatory steps. Actions run sequentially and stop on first error.",
    },
  } as const;

  return [
    {
      name: "request_teach_access",
      description:
        "Request permission to guide the user through a task step-by-step with on-screen tooltips. " +
        "Use this INSTEAD OF request_access when the user wants to LEARN how to do something " +
        '(phrases like "teach me", "walk me through", "show me how", "help me learn"). ' +
        "On approval the main Claude window hides and a fullscreen tooltip overlay appears. " +
        "You then call teach_step repeatedly; each call shows one tooltip and waits for the user to click Next. " +
        "Same app-allowlist semantics as request_access, but no clipboard/system-key flags. " +
        "Teach mode ends automatically when your turn ends.",
      inputSchema: {
        type: "object" as const,
        properties: {
          apps: {
            type: "array",
            items: { type: "string" },
            description:
              'Application display names (e.g. "Slack", "Calendar") or bundle identifiers. Resolved case-insensitively against installed apps.' +
              installedAppsHint,
          },
          reason: {
            type: "string",
            description:
              'What you will be teaching. Shown in the approval dialog as "Claude wants to guide you through {reason}". Keep it short and task-focused.',
          },
        },
        required: ["apps", "reason"],
      },
    },

    {
      name: "teach_step",
      description:
        "Show one guided-tour tooltip and wait for the user to click Next. On Next, execute the actions, " +
        "take a fresh screenshot, and return both — you do NOT need a separate screenshot call between steps. " +
        "The returned image shows the state after your actions ran; anchor the next teach_step against it. " +
        "IMPORTANT — the user only sees the tooltip during teach mode. Put ALL narration in `explanation`. " +
        "Text you emit outside teach_step calls is NOT visible until teach mode ends. " +
        "Pack as many actions as possible into each step's `actions` array — the user waits through " +
        "the whole round trip between clicks, so one step that fills a form beats five steps that fill one field each. " +
        "Returns {exited:true} if the user clicks Exit — do not call teach_step again after that. " +
        "Take an initial screenshot before your FIRST teach_step to anchor it.",
      inputSchema: {
        type: "object" as const,
        properties: teachStepProperties,
        // `anchor` is intentionally not required: omitting it centers the tooltip.
        required: ["explanation", "next_preview", "actions"],
      },
    },

    {
      name: "teach_batch",
      description:
        "Queue multiple teach steps in one tool call. Parallels computer_batch: " +
        "N steps → one model↔API round trip instead of N. Each step still shows a tooltip " +
        "and waits for the user's Next click, but YOU aren't waiting for a round trip between steps. " +
        "You can call teach_batch multiple times in one tour — treat each batch as one predictable " +
        "SEGMENT (typically: all the steps on one page). The returned screenshot shows the state " +
        "after the batch's final actions; anchor the NEXT teach_batch against it. " +
        "WITHIN a batch, all anchors and click coordinates refer to the PRE-BATCH screenshot " +
        "(same invariant as computer_batch) — for steps 2+ in a batch, either omit anchor " +
        "(centered tooltip) or target elements you know won't have moved. " +
        "Good pattern: batch 5 tooltips on page A (last step navigates) → read returned screenshot → " +
        "batch 3 tooltips on page B → done. " +
        "Returns {exited:true, stepsCompleted:N} if the user clicks Exit — do NOT call again after that; " +
        "{stepsCompleted, stepFailed, ...} if an action errors mid-batch; " +
        "otherwise {stepsCompleted, results:[...]} plus a final screenshot. " +
        "Fall back to individual teach_step calls when you need to react to each intermediate screenshot.",
      inputSchema: {
        type: "object" as const,
        properties: {
          steps: {
            type: "array",
            minItems: 1,
            // Each item has the same shape as a standalone teach_step call.
            items: {
              type: "object",
              properties: teachStepProperties,
              required: ["explanation", "next_preview", "actions"],
            },
            description:
              "Ordered steps. Validated upfront — a typo in step 5 errors before any tooltip shows.",
          },
        },
        required: ["steps"],
      },
    },
  ];
}
|
||||
@@ -1,70 +1,622 @@
|
||||
/**
|
||||
* @ant/computer-use-mcp — Types
|
||||
*
|
||||
* 从调用侧反推的真实类型定义,替代 any stub。
|
||||
*/
|
||||
import type {
|
||||
ComputerExecutor,
|
||||
InstalledApp,
|
||||
ScreenshotResult,
|
||||
} from "./executor.js";
|
||||
|
||||
export type CoordinateMode = 'pixels' | 'normalized'
|
||||
|
||||
export interface CuSubGates {
|
||||
pixelValidation: boolean
|
||||
clipboardPasteMultiline: boolean
|
||||
mouseAnimation: boolean
|
||||
hideBeforeAction: boolean
|
||||
autoTargetDisplay: boolean
|
||||
clipboardGuard: boolean
|
||||
}
|
||||
/**
 * `ScreenshotResult` without the base64 blob. The shape hosts persist for
 * cross-respawn `scaleCoord` survival.
 */
export type ScreenshotDims = Omit<ScreenshotResult, "base64">;
|
||||
|
||||
/**
 * Logging sink with five severity levels. Each member takes a message plus
 * arbitrary extra arguments and returns nothing.
 *
 * Shape mirrors claude-for-chrome-mcp/src/types.ts:1-7
 */
export interface Logger {
  info: (message: string, ...args: unknown[]) => void;
  error: (message: string, ...args: unknown[]) => void;
  warn: (message: string, ...args: unknown[]) => void;
  debug: (message: string, ...args: unknown[]) => void;
  silly: (message: string, ...args: unknown[]) => void;
}
|
||||
|
||||
export interface CuPermissionRequest {
|
||||
apps: Array<{ bundleId: string; displayName: string }>
|
||||
requestedFlags: GrantFlags
|
||||
reason: string
|
||||
tccState: { accessibility: boolean; screenRecording: boolean }
|
||||
willHide: string[]
|
||||
/**
 * Per-app permission tier. Hardcoded by category at grant time — the
 * approval dialog displays the tier but the user cannot change it (for now).
 *
 * - `"read"` — visible in screenshots, NO interaction (no clicks, no typing).
 *   Browsers land here: the model can read a page that's already open, but
 *   must use the Claude-in-Chrome MCP for any navigation/clicking. Trading
 *   platforms land here too (no CiC alternative — the model asks the user).
 * - `"click"` — visible + plain left-click, scroll. NO typing/keys,
 *   NO right/middle-click, NO modifier-clicks, NO drag-drop (all text-
 *   injection vectors). Terminals/IDEs land here: the model can click a
 *   Run button or scroll test output, but `type("rm -rf /")` is blocked
 *   and so is right-click→Paste and dragging text onto the terminal.
 * - `"full"` — visible + click + type/key/paste. Everything else.
 *
 * Enforced in `runInputActionGates` via the frontmost-app check: keyboard
 * actions require `"full"`, mouse actions require `"click"` or higher.
 */
export type CuAppPermTier = "read" | "click" | "full";
|
||||
|
||||
/**
 * A single app the user has approved for the current session. Session-scoped
 * only — there is no "once" or "forever" scope (unlike Chrome's per-domain
 * three-way). CU has no natural "once" unit; one task = hundreds of clicks.
 * Mirrors how `chromeAllowedDomains` is a plain `string[]` with no per-item
 * scope.
 */
export interface AppGrant {
  bundleId: string;
  displayName: string;
  /** Epoch ms. For Settings-page display ("Granted 3m ago"). */
  grantedAt: number;
  /**
   * Undefined → `"full"` (back-compat for pre-tier grants persisted in
   * session state).
   */
  tier?: CuAppPermTier;
}
|
||||
|
||||
export interface GrantFlags {
|
||||
clipboardRead: boolean
|
||||
clipboardWrite: boolean
|
||||
systemKeyCombos: boolean
|
||||
/**
 * Session-level capability flags, separate from the per-app grants.
 * Orthogonal to the app allowlist.
 */
export interface CuGrantFlags {
  clipboardRead: boolean;
  clipboardWrite: boolean;
  /**
   * When false, the `key` tool rejects combos in `keyBlocklist.ts`
   * (cmd+q, cmd+tab, cmd+space, cmd+shift+q, ctrl+alt+delete). All other
   * key sequences work regardless.
   */
  systemKeyCombos: boolean;
}
|
||||
|
||||
export interface CuPermissionResponse {
|
||||
granted: string[]
|
||||
denied: string[]
|
||||
flags: GrantFlags
|
||||
}
|
||||
|
||||
/** Baseline grant flags: every optional capability starts switched off. */
export const DEFAULT_GRANT_FLAGS: CuGrantFlags = {
  clipboardRead: false,
  clipboardWrite: false,
  systemKeyCombos: false,
};
|
||||
|
||||
/**
 * Host picks via GrowthBook JSON feature `chicago_coordinate_mode`, baked
 * into tool param descriptions at server-construction time. The model sees
 * ONE convention and never learns the other exists. `normalized_0_100`
 * sidesteps the Retina scaleFactor bug class entirely.
 */
export type CoordinateMode = "pixels" | "normalized_0_100";
|
||||
|
||||
/**
 * Independent kill switches for subtle/risky ported behaviors. Read from
 * GrowthBook by the host adapter, consulted in `toolCalls.ts`.
 */
export interface CuSubGates {
  /** 9×9 exact-byte staleness guard before click. */
  pixelValidation: boolean;
  /** Route `type("foo\nbar")` through clipboard instead of keystroke-by-keystroke. */
  clipboardPasteMultiline: boolean;
  /**
   * Ease-out-cubic mouse glide at 60fps, distance-proportional duration
   * (2000 px/sec, capped at 0.5s). Adds up to ~0.5s latency
   * per click. When off, cursor teleports instantly.
   */
  mouseAnimation: boolean;
  /**
   * Pre-action sequence: hide non-allowlisted apps, then defocus us (from the
   * Vercept acquisition). When off, the
   * frontmost gate fires in the normal case and the model gets stuck — this
   * is the A/B-test-the-old-broken-behavior switch.
   */
  hideBeforeAction: boolean;
  /**
   * Auto-resolve the target display before each screenshot when the
   * selected display has no allowed-app windows. When on, `handleScreenshot`
   * uses the atomic Swift path; off → sticks with `selectedDisplayId`.
   */
  autoTargetDisplay: boolean;
  /**
   * Stash+clear the clipboard while a tier-"click" app is frontmost.
   * Closes the gap where a click-tier terminal/IDE has a UI Paste button
   * that's plain-left-clickable — without this, the tier "click"
   * keyboard block can be routed around by clicking Paste. Restored when
   * a non-"click" app becomes frontmost, or at turn end.
   */
  clipboardGuard: boolean;
}
|
||||
|
||||
export interface ComputerUseConfig {
|
||||
coordinateMode: CoordinateMode
|
||||
enabledTools: string[]
|
||||
// ----------------------------------------------------------------------------
|
||||
// Permission request/response (mirror of BridgePermissionRequest, types.ts:77-94)
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/** One entry per app the model asked for, after name → bundle ID resolution. */
export interface ResolvedAppRequest {
  /** What the model asked for (e.g. "Slack", "com.tinyspeck.slackmacgap"). */
  requestedName: string;
  /** The resolved InstalledApp if found, else undefined (shown greyed in the UI). */
  resolved?: InstalledApp;
  /** Shell-access-equivalent bundle IDs get a UI warning. See sentinelApps.ts. */
  isSentinel: boolean;
  /** Already in the allowlist → skip the checkbox, return in `granted` immediately. */
  alreadyGranted: boolean;
  /**
   * Hardcoded tier for this app (browser→"read", terminal→"click", else "full").
   * The dialog displays this read-only; the renderer passes it through
   * verbatim in the AppGrant.
   */
  proposedTier: CuAppPermTier;
}
|
||||
|
||||
/**
 * Payload for the renderer's approval dialog. It travels inside the existing
 * `ToolPermissionRequest.input: unknown` field
 * (packages/utils/desktop/bridge/common/claude.web.ts:1262), so the IPC schema
 * does not need to change.
 */
export interface CuPermissionRequest {
  requestId: string;

  /** Reason string supplied by the model. Displayed prominently in the approval UI. */
  reason: string;

  apps: ResolvedAppRequest[];

  /** The flags the model asked for. The user can toggle them independently of the apps. */
  requestedFlags: Partial<CuGrantFlags>;

  /**
   * Drives the "On Windows, Claude can see all apps..." footnote. Copied from
   * `executor.capabilities.screenshotFiltering` so that the renderer does not
   * need any platform knowledge.
   */
  screenshotFiltering: "native" | "none";

  /**
   * Set only while TCC permissions are NOT yet granted. When set, the renderer
   * replaces the app list with a TCC toggle panel (two rows: Accessibility and
   * Screen Recording). A row's "Request" button triggers the OS prompt; the
   * store polls on window focus and flips the toggle once the grant is
   * detected. macOS itself asks the user to restart after Screen Recording is
   * granted — we don't.
   */
  tccState?: {
    accessibility: boolean;
    screenRecording: boolean;
  };

  /**
   * Apps that have windows on the CU display but are not in the requested
   * allowlist; they will be hidden the first time Claude takes an action.
   * Computed at request_access time, so it may be slightly stale when the user
   * clicks Allow — it is a preview, not a contract. Left absent when empty so
   * the renderer can skip the section cleanly.
   */
  willHide?: Array<{ bundleId: string; displayName: string }>;

  /**
   * Value of the `chicagoAutoUnhide` app preference at request time. The
   * renderer uses it to choose between the "...then restored when Claude is
   * done" and "...will be hidden" copy. Absent whenever `willHide` is absent
   * (same condition).
   */
  autoUnhideEnabled?: boolean;
}
|
||||
|
||||
/**
 * The value the renderer places in `updatedInput._cuGrants` when the user
 * clicks "Allow for this session" (mirrors the `_allowAllSites` sentinel at
 * LocalAgentModeSessionManager.ts:2794).
 */
export interface CuPermissionResponse {
  granted: AppGrant[];

  /** Bundle IDs the user unchecked, plus apps that were not installed. */
  denied: Array<{ bundleId: string; reason: "user_denied" | "not_installed" }>;

  flags: CuGrantFlags;

  /**
   * Whether the user clicked Allow in THIS dialog. Set only by the teach-mode
   * handler; regular request_access does not need it because the session
   * manager's `result.behavior` gates the merge there. It exists because when
   * every requested app is already granted (skipDialogGrants non-empty,
   * needDialog empty), Allow and Deny both yield `{granted:[], denied:[]}` and
   * the tool handler could not otherwise distinguish them.
   * Undefined → legacy/regular path; do not gate on it.
   */
  userConsented?: boolean;
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Host adapter (mirror of ClaudeForChromeContext, types.ts:33-62)
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
 * Dependencies that live for the whole process — everything that does NOT
 * vary per tool call. Built once by
 * `apps/desktop/src/main/nest-only/chicago/hostAdapter.ts`. This package has no
 * Electron imports; the host injects everything.
 */
export interface ComputerUseHostAdapter {
  serverName: string;
  logger: Logger;
  executor: ComputerExecutor;

  /**
   * TCC state check (Accessibility + Screen Recording on macOS). A pure check:
   * no dialog, no relaunch. When either permission is missing,
   * `request_access` threads the state through to the renderer, which shows a
   * toggle panel; every other tool returns a tool error.
   */
  ensureOsPermissions(): Promise<
    | { granted: true }
    | { granted: false; accessibility: boolean; screenRecording: boolean }
  >;

  /** The Settings-page kill switch (`chicagoEnabled` app preference). */
  isDisabled(): boolean;

  /**
   * The `chicagoAutoUnhide` app preference. `buildAccessRequest` reads it to
   * populate `CuPermissionRequest.autoUnhideEnabled`, so the renderer's
   * "will be hidden" copy says "then restored" only when this is true.
   */
  getAutoUnhideEnabled(): boolean;

  /**
   * Sub-gates, re-read on every tool call so that GrowthBook flips take effect
   * mid-session without a restart.
   */
  getSubGates(): CuSubGates;

  /**
   * JPEG decode + crop + raw pixel bytes for the PixelCompare staleness guard.
   * Injected so that this package stays Electron-free. The host implements it
   * with `nativeImage.createFromBuffer(jpeg).crop(rect).toBitmap()` —
   * Chromium's decoders, BSD-licensed, no `.node` binary.
   *
   * @param jpegBase64 - Base64-encoded JPEG to decode.
   * @param rect - Region to crop out of the decoded image.
   * @returns The raw pixel bytes, or null on decode/crop failure. Callers
   *   treat null as `skipped` and let the click proceed (a validation failure
   *   must never block the action).
   */
  cropRawPatch(
    jpegBase64: string,
    rect: { x: number; y: number; width: number; height: number },
  ): Buffer | null;
}
|
||||
|
||||
/**
 * Executor handle exposed to the host adapter.
 *
 * NOTE(review): `CuPermissionRequest.screenshotFiltering` is documented as
 * coming from `executor.capabilities.screenshotFiltering` with the values
 * "native" | "none", which a boolean record cannot express — confirm this
 * declaration is still the intended one.
 */
export interface ComputerExecutor {
  /** Capability flags keyed by capability name. */
  capabilities: Record<string, boolean>
}
|
||||
// ----------------------------------------------------------------------------
|
||||
// Session context (getter/callback bag for bindSessionContext)
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
 * Per-session state binding consumed by `bindSessionContext`. A host builds
 * one of these per session: the getters read fresh values from its session
 * store and the callbacks write back to it. The dispatcher returned by
 * `bindSessionContext` assembles `ComputerUseOverrides` from these getters on
 * every call.
 *
 * Callbacks have to be present at construction time, because
 * `bindSessionContext` reads them once at bind rather than per call.
 *
 * The lock hooks are **async**: `bindSessionContext` awaits them before
 * `handleToolCall` and then passes `checkCuLock: undefined` in the overrides so
 * that the sync Gate-3 inside `handleToolCall` becomes a no-op. Hosts with
 * in-memory sync locks (Cowork) wrap them trivially; hosts with cross-process
 * locks (the CLI's O_EXCL file) call the real async primitive directly.
 */
export interface ComputerUseSessionContext {
  // ── Read state fresh per call ──────────────────────────────────────

  getAllowedApps(): readonly AppGrant[];
  getGrantFlags(): CuGrantFlags;
  /** The per-user auto-deny list (Settings page). An empty array means none. */
  getUserDeniedBundleIds(): readonly string[];
  getSelectedDisplayId(): number | undefined;
  getDisplayPinnedByModel?(): boolean;
  getDisplayResolvedForApps?(): string | undefined;
  getTeachModeActive?(): boolean;
  /**
   * Dimensions-only fallback for when `lastScreenshot` is unset
   * (cross-respawn). `bindSessionContext` rebuilds `{...dims, base64: ""}` so
   * that scaleCoord still works and pixelCompare correctly skips.
   */
  getLastScreenshotDims?(): ScreenshotDims | undefined;

  // ── Write-back callbacks ───────────────────────────────────────────

  /**
   * Shows the approval dialog: the host routes to its UI and awaits the user.
   * The signal is aborted when the tool call finishes before the user answers
   * (MCP timeout, etc.); hosts dismiss the dialog on abort.
   */
  onPermissionRequest?(
    req: CuPermissionRequest,
    signal: AbortSignal,
  ): Promise<CuPermissionResponse>;
  /** Teach-mode sibling of `onPermissionRequest`. */
  onTeachPermissionRequest?(
    req: CuTeachPermissionRequest,
    signal: AbortSignal,
  ): Promise<CuPermissionResponse>;
  /**
   * Invoked by `bindSessionContext` once a permission response has been merged
   * into the allowlist (dedupe on bundleId, truthy-only flag spread). The host
   * persists the result so it survives resume.
   */
  onAllowedAppsChanged?(apps: readonly AppGrant[], flags: CuGrantFlags): void;
  onAppsHidden?(bundleIds: string[]): void;
  /** Reads the session's clipboardGuard stash. undefined → no stash is held. */
  getClipboardStash?(): string | undefined;
  /** Writes the clipboardGuard stash. undefined clears it. */
  onClipboardStashChanged?(stash: string | undefined): void;
  onResolvedDisplayUpdated?(displayId: number): void;
  onDisplayPinned?(displayId: number | undefined): void;
  onDisplayResolvedForApps?(sortedBundleIdsKey: string): void;
  /** Invoked after each screenshot. The host persists it so it survives respawn. */
  onScreenshotCaptured?(dims: ScreenshotDims): void;
  onTeachModeActivated?(): void;
  onTeachStep?(req: TeachStepRequest): Promise<TeachStepResult>;
  onTeachWorking?(): void;

  // ── Lock (async) ───────────────────────────────────────────────────

  /**
   * At most one session uses CU at a time. `bindSessionContext` awaits this
   * before dispatch. Undefined → no lock gating (proceed).
   */
  checkCuLock?(): Promise<{ holder: string | undefined; isSelf: boolean }>;
  /**
   * Takes the lock. Called when `checkCuLock` returned `holder: undefined` on
   * a non-deferring tool. The host emits its enter-CU signals here.
   */
  acquireCuLock?(): Promise<void>;
  /**
   * Host-specific text for the lock-held error; the default is the package's
   * generic message. The CLI host includes the holder session-ID prefix.
   */
  formatLockHeldMessage?(holder: string): string;

  /**
   * User-abort signal, passed through to `ComputerUseOverrides.isAborted` for
   * the mid-loop checks in handleComputerBatch / handleType. See that field
   * for the semantics.
   */
  isAborted?(): boolean;
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Per-call overrides (mirror of PermissionOverrides, types.ts:97-102)
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
 * Assembled FRESH on every tool call by `bindSessionContext` from the
 * `ComputerUseSessionContext` getters. This is how a singleton MCP server can
 * carry per-session state: the state lives in the host's session store, not in
 * the server.
 */
export interface ComputerUseOverrides {
  allowedApps: AppGrant[];
  grantFlags: CuGrantFlags;
  coordinateMode: CoordinateMode;

  /**
   * The user-configured auto-deny list (Settings → Desktop app → Computer
   * Use). Bundle IDs listed here are removed from request_access BEFORE the
   * approval dialog, so they never reach the user for approval regardless of
   * tier. The response tells the agent to ask the user to remove the app from
   * their deny list in Settings if access is genuinely needed.
   *
   * Per-USER and persistent across restarts (read from appPreferences on each
   * call, not from session state) — unlike `allowedApps`, which is
   * per-session. An empty array means no user-configured denies (the default).
   */
  userDeniedBundleIds: readonly string[];

  /**
   * The display CU operates on, read fresh per call. `scaleCoord` uses the
   * `originX/Y` snapshotted in `lastScreenshot`, so a mid-session switch only
   * affects the NEXT screenshot/prepare call.
   */
  selectedDisplayId?: number;

  /**
   * Called and awaited by the `request_access` tool handler. The wrapper
   * closure in serverDef.ts (mirroring InternalMcpServerManager.ts:131-177)
   * routes through `handleToolPermission` → IPC → renderer ChicagoApproval.
   * When it resolves, the wrapper mutates
   * `InternalServerContext.cuAllowedApps` as a side effect BEFORE returning
   * here.
   *
   * Undefined when the session was not wired with a permission handler (e.g. a
   * future headless mode); `request_access` returns a tool error in that case.
   */
  onPermissionRequest?: (req: CuPermissionRequest) => Promise<CuPermissionResponse>;

  /**
   * Input to the pixel-validation staleness guard: the model's last
   * screenshot, stashed by serverDef.ts after each `screenshot` tool call.
   * Undefined on cold start → pixel validation is skipped (click proceeds).
   */
  lastScreenshot?: ScreenshotResult;

  /**
   * Fired after every `prepareForAction` with the bundle IDs it just hid. The
   * wrapper closure in serverDef.ts accumulates them into
   * `Session.cuHiddenDuringTurn` through a write-through callback (the same
   * pattern as `onCuPermissionUpdated`). At turn end
   * (`sdkMessage.type === "result"`), everything in the set is unhidden if the
   * `chicagoAutoUnhide` setting is on. The set is cleared regardless of the
   * setting so it does not leak across turns.
   *
   * Undefined when the session was not wired with a tracker — unhide simply
   * does not happen.
   */
  onAppsHidden?: (bundleIds: string[]) => void;

  /**
   * Reads the clipboardGuard stash from session state. `undefined` means no
   * stash is held — `syncClipboardStash` stashes on first entry to click-tier
   * and clears on restore. Follows the same getter pattern as
   * `cuHiddenDuringTurn`: the state lives on the host's session, not at module
   * level here.
   */
  getClipboardStash?: () => string | undefined;

  /**
   * Writes the clipboardGuard stash to session state; `undefined` clears it.
   * Sibling of `onAppsHidden` — the wrapper closure writes through to
   * `Session.cuClipboardStash`. At turn end the host reads and clears it
   * directly and restores via Electron's `clipboard.writeText` (no nest-only
   * import surface).
   */
  onClipboardStashChanged?: (stash: string | undefined) => void;

  /**
   * Writes the resolver's chosen display back to the session so that teach
   * overlay positioning and later non-resolver calls use the same display.
   * Fired by `handleScreenshot` in the atomic `autoTargetDisplay` path when
   * the pick from `resolvePrepareCapture` differs from `selectedDisplayId`.
   * Fire-and-forget.
   */
  onResolvedDisplayUpdated?: (displayId: number) => void;

  /**
   * Set when the model explicitly chose a display via `switch_display`. When
   * true, `handleScreenshot` passes `autoResolve: false` so that the Swift
   * resolver honors `selectedDisplayId` directly (straight cuDisplayInfo
   * passthrough) rather than running the co-location/chase chain. Otherwise
   * the resolver's Step 2 ("host + allowed co-located → host") overrides any
   * `selectedDisplayId` whenever an allowed app shares the host's monitor.
   */
  displayPinnedByModel?: boolean;

  /**
   * Writes the model's explicit display pick to the session. `displayId:
   * undefined` clears both `selectedDisplayId` and the pin (back to auto).
   * Sibling of `onResolvedDisplayUpdated`, but it also sets the pin flag — the
   * two are semantically distinct (resolver-picked vs model-picked).
   */
  onDisplayPinned?: (displayId: number | undefined) => void;

  /**
   * The sorted, comma-joined bundle-ID set for which the display was last
   * auto-resolved. `handleScreenshot` compares it with the current allowed set
   * and passes `autoResolve: true` only when they differ, so the resolver does
   * not yank the display on every screenshot — only when the app set has
   * changed since the last resolve (or manual switch).
   */
  displayResolvedForApps?: string;

  /**
   * Records which app set the current display selection was made for. Fired
   * together with `onResolvedDisplayUpdated` when the resolver picks, so the
   * next screenshot sees a matching set and skips auto-resolve.
   */
  onDisplayResolvedForApps?: (sortedBundleIdsKey: string) => void;

  /**
   * Global CU lock — at most one session actively uses CU at a time. Checked
   * in `handleToolCall` after kill-switch/TCC and before dispatch. Every CU
   * tool, `request_access` included, goes through it.
   *
   * - `holder === undefined` → lock is free, safe to acquire
   * - `isSelf === true` → this session already holds it (no-op, proceed)
   * - `holder !== undefined && !isSelf` → blocked, return tool error
   *
   * `undefined` callback → the lock system is not wired (e.g. CCD). Proceed
   * without gating — absence of the mechanism ≠ locked out.
   *
   * Release is managed by the host (on session idle/stop/archive); this
   * package never releases.
   */
  checkCuLock?: () => { holder: string | undefined; isSelf: boolean };

  /**
   * Takes the lock for this session. `handleToolCall` calls it exactly once
   * per turn, on the FIRST CU tool call for which `checkCuLock().holder` is
   * undefined. No-op if already held (defensive — the check should have
   * short-circuited). The host emits an event that the overlay listens to.
   */
  acquireCuLock?: () => void;

  /**
   * User-abort signal. Checked mid-iteration inside `handleComputerBatch` and
   * inside the grapheme loop of `handleType`, so an in-flight batch/type stops
   * promptly on overlay Stop rather than running to completion after the host
   * has already abandoned the tool result.
   *
   * Undefined → never aborts (e.g. unwired host). Read live on each check —
   * the same lazy-getter pattern as `checkCuLock`.
   */
  isAborted?: () => boolean;

  // ── Teach mode ───────────────────────────────────────────────────────
  // Wired only when the host's teachModeEnabled gate is on. All five
  // undefined → `request_teach_access` / `teach_step` return tool errors
  // and teach mode is effectively off.

  /**
   * Sibling of `onPermissionRequest` with the same
   * blocking-await-on-renderer-dialog semantics, except that it routes to
   * ComputerUseTeachApproval.tsx (which explains the window-hides-during-guide
   * behavior) rather than ComputerUseApproval. The wrapper closure in
   * serverDef.ts writes grants through to session state via
   * `onCuPermissionUpdated`, exactly as `onPermissionRequest` does.
   */
  onTeachPermissionRequest?: (
    req: CuTeachPermissionRequest,
  ) => Promise<CuPermissionResponse>;

  /**
   * Called by `handleRequestTeachAccess` once the user has approved and at
   * least one app was granted. The host sets `session.teachModeActive = true`
   * and emits `teachModeChanged` → the teach controller hides the main window
   * and shows the fullscreen overlay. The host clears it on turn end
   * (`transitionTo("idle")`) alongside the CU lock release.
   */
  onTeachModeActivated?: () => void;

  /**
   * Read by `handleRequestAccess` and `handleRequestTeachAccess` so they can
   * short-circuit with a clear tool error while teach mode is active. The main
   * window is hidden during teach mode, so a permission dialog would render
   * invisibly and handleToolPermission would block forever on that invisible
   * prompt; it is better to tell the model to exit teach mode first. This is a
   * getter (not a boolean field) because teach mode state lives on the
   * session, not on this per-call overrides object.
   */
  getTeachModeActive?: () => boolean;

  /**
   * Called by `handleTeachStep` with the scaled anchor + text. The host stores
   * the resolver and emits `teachStepRequested` → the teach controller pushes
   * the payload to the overlay → the user reads it and clicks Next → IPC → the
   * host calls the stored resolver → this promise resolves. Resolves with
   * `{action: "exit"}` when the user clicks Exit (or the turn is interrupted);
   * `handleTeachStep` then short-circuits without executing actions.
   *
   * Same blocking-promise pattern as `onPermissionRequest`, but resolved by
   * the teach overlay's own preload (not the main renderer's tool-approval UI).
   */
  onTeachStep?: (req: TeachStepRequest) => Promise<TeachStepResult>;

  /**
   * Called right after `onTeachStep` resolves with "next", before action
   * dispatch begins. The host emits `teachStepWorking` → the overlay flips to
   * the spinner state (Next button gone, Exit stays, "Working…" + rotating
   * notch). The next `onTeachStep` call replaces the spinner with the new
   * tooltip content.
   */
  onTeachWorking?: () => void;
}
|
||||
|
||||
// ----------------------------------------------------------------------------
|
||||
// Teach mode (guided-tour tooltips with Next-button action execution)
|
||||
// ----------------------------------------------------------------------------
|
||||
|
||||
/**
 * The payload the host pushes to the teach overlay BrowserWindow.
 * `handleTeachStep` in toolCalls.ts builds it from the model's `teach_step`
 * args.
 *
 * `anchorLogical` is POST-`scaleCoord`: **full-display** logical macOS points
 * (origin = monitor top-left, menu bar included, because cuDisplayInfo returns
 * CGDisplayBounds). The overlay window sits at `workArea.{x,y}` (which
 * excludes the menu bar/Dock), so `updateTeachStep` in teach/window.ts
 * subtracts the workArea offset before IPC to make the HTML's CSS coords line
 * up.
 */
export interface TeachStepRequest {
  explanation: string;
  nextPreview: string;
  /** Full-display logical points. Undefined → the overlay centers the tooltip and hides the arrow. */
  anchorLogical?: { x: number; y: number };
}
|
||||
|
||||
/** Outcome of a teach step: the user either advanced ("next") or left the guide ("exit"). */
export type TeachStepResult =
  | { action: "next" }
  | { action: "exit" };
|
||||
|
||||
/**
 * Payload for the renderer's ComputerUseTeachApproval dialog. Like
 * `CuPermissionRequest`, it travels inside `ToolPermissionRequest.input:
 * unknown`. It is a separate type (rather than a flag on
 * `CuPermissionRequest`) so that the two approval components can narrow
 * independently and the teach dialog can drop fields it does not render (teach
 * mode has no grant-flag checkboxes).
 */
export interface CuTeachPermissionRequest {
  requestId: string;

  /** Reason supplied by the model. Used in the dialog headline ("guide you through {reason}"). */
  reason: string;

  apps: ResolvedAppRequest[];

  screenshotFiltering: "native" | "none";

  /** Set only while TCC is ungranted — same semantics as `CuPermissionRequest.tccState`. */
  tccState?: {
    accessibility: boolean;
    screenRecording: boolean;
  };

  willHide?: Array<{ bundleId: string; displayName: string }>;

  /** Same semantics as `CuPermissionRequest.autoUnhideEnabled`. */
  autoUnhideEnabled?: boolean;
}
|
||||
|
||||
258
packages/@ant/computer-use-swift/src/backends/darwin.ts
Normal file
258
packages/@ant/computer-use-swift/src/backends/darwin.ts
Normal file
@@ -0,0 +1,258 @@
|
||||
/**
|
||||
* macOS backend for computer-use-swift
|
||||
*
|
||||
* Uses AppleScript/JXA/screencapture for display info, app management,
|
||||
* and screenshots.
|
||||
*/
|
||||
|
||||
import { readFileSync, unlinkSync } from 'fs'
|
||||
import { tmpdir } from 'os'
|
||||
import { join } from 'path'
|
||||
import type {
|
||||
AppInfo, AppsAPI, DisplayAPI, DisplayGeometry, InstalledApp,
|
||||
PrepareDisplayResult, RunningApp, ScreenshotAPI, ScreenshotResult,
|
||||
SwiftBackend, WindowDisplayInfo,
|
||||
} from '../types.js'
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Synchronously runs a JXA (JavaScript for Automation) snippet through
 * `osascript -l JavaScript` and returns its trimmed stdout.
 *
 * The exit code and stderr are not inspected, so a failing script simply
 * yields whatever reached stdout.
 */
function jxaSync(script: string): string {
  const { stdout } = Bun.spawnSync({
    cmd: ['osascript', '-l', 'JavaScript', '-e', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
|
||||
|
||||
/**
 * Synchronously runs an AppleScript snippet through `osascript` and returns
 * its trimmed stdout.
 *
 * The exit code and stderr are not inspected, so a failing script simply
 * yields whatever reached stdout.
 */
function osascriptSync(script: string): string {
  const { stdout } = Bun.spawnSync({
    cmd: ['osascript', '-e', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
|
||||
|
||||
/**
 * Runs an AppleScript snippet through `osascript` without blocking and
 * resolves with its trimmed stdout once the process has exited.
 *
 * The exit code and stderr are not inspected.
 */
async function osascript(script: string): Promise<string> {
  const child = Bun.spawn(['osascript', '-e', script], {
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const output = await new Response(child.stdout).text()
  await child.exited
  return output.trim()
}
|
||||
|
||||
/**
 * Runs a JXA (JavaScript for Automation) snippet through
 * `osascript -l JavaScript` without blocking and resolves with its trimmed
 * stdout once the process has exited.
 *
 * The exit code and stderr are not inspected.
 */
async function jxa(script: string): Promise<string> {
  const child = Bun.spawn(['osascript', '-l', 'JavaScript', '-e', script], {
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const output = await new Response(child.stdout).text()
  await child.exited
  return output.trim()
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// DisplayAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** JXA probe that enumerates active displays through CoreGraphics. */
const CORE_GRAPHICS_DISPLAY_SCRIPT = `
        ObjC.import("CoreGraphics");
        var displays = $.CGDisplayCopyAllDisplayModes ? [] : [];
        var active = $.CGGetActiveDisplayList(10, null, Ref());
        var countRef = Ref();
        $.CGGetActiveDisplayList(0, null, countRef);
        var count = countRef[0];
        var idBuf = Ref();
        $.CGGetActiveDisplayList(count, idBuf, countRef);
        var result = [];
        for (var i = 0; i < count; i++) {
          var did = idBuf[i];
          var w = $.CGDisplayPixelsWide(did);
          var h = $.CGDisplayPixelsHigh(did);
          var mode = $.CGDisplayCopyDisplayMode(did);
          var pw = $.CGDisplayModeGetPixelWidth(mode);
          var sf = pw > 0 && w > 0 ? pw / w : 2;
          result.push({width: w, height: h, scaleFactor: sf, displayId: did});
        }
        JSON.stringify(result);
      `

/** JXA probe that enumerates screens through AppKit's NSScreen. */
const NS_SCREEN_DISPLAY_SCRIPT = `
          ObjC.import("AppKit");
          var screens = $.NSScreen.screens;
          var result = [];
          for (var i = 0; i < screens.count; i++) {
            var s = screens.objectAtIndex(i);
            var frame = s.frame;
            var desc = s.deviceDescription;
            var screenNumber = desc.objectForKey($("NSScreenNumber")).intValue;
            var backingFactor = s.backingScaleFactor;
            result.push({
              width: Math.round(frame.size.width),
              height: Math.round(frame.size.height),
              scaleFactor: backingFactor,
              displayId: screenNumber
            });
          }
          JSON.stringify(result);
        `

/** Geometry reported when no display information can be obtained. */
function defaultDisplayGeometry(): DisplayGeometry {
  return { width: 1920, height: 1080, scaleFactor: 2, displayId: 1 }
}

/** Parses a probe's JSON output and coerces every field to a number. */
function parseDisplayGeometries(raw: string): DisplayGeometry[] {
  const entries = JSON.parse(raw) as DisplayGeometry[]
  return entries.map(entry => ({
    width: Number(entry.width),
    height: Number(entry.height),
    scaleFactor: Number(entry.scaleFactor),
    displayId: Number(entry.displayId),
  }))
}

/** Display geometry queries for macOS, backed by JXA probes run via `jxaSync`. */
export const display: DisplayAPI = {
  /**
   * Returns the geometry of `displayId` when it is listed; otherwise the first
   * listed display, or 1920×1080 @2x if nothing is listed.
   */
  getSize(displayId?: number): DisplayGeometry {
    const known = this.listAll()
    const requested =
      displayId === undefined ? undefined : known.find(d => d.displayId === displayId)
    return requested ?? known[0] ?? defaultDisplayGeometry()
  },

  /**
   * Lists all displays. Tries the CoreGraphics probe first, then the NSScreen
   * probe; if both fail (script error or unparsable output) a single default
   * 1920×1080 @2x display is returned.
   */
  listAll(): DisplayGeometry[] {
    for (const probe of [CORE_GRAPHICS_DISPLAY_SCRIPT, NS_SCREEN_DISPLAY_SCRIPT]) {
      try {
        return parseDisplayGeometries(jxaSync(probe))
      } catch {
        // This probe failed; move on to the next one.
      }
    }
    return [defaultDisplayGeometry()]
  },
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// AppsAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export const apps: AppsAPI = {
|
||||
async prepareDisplay(_allowlistBundleIds, _surrogateHost, _displayId) {
|
||||
return { activated: '', hidden: [] }
|
||||
},
|
||||
|
||||
async previewHideSet(_bundleIds, _displayId) {
|
||||
return []
|
||||
},
|
||||
|
||||
async findWindowDisplays(bundleIds) {
|
||||
return bundleIds.map(bundleId => ({ bundleId, displayIds: [1] }))
|
||||
},
|
||||
|
||||
async appUnderPoint(_x, _y) {
|
||||
try {
|
||||
const result = await jxa(`
|
||||
ObjC.import("CoreGraphics");
|
||||
ObjC.import("AppKit");
|
||||
var pt = $.CGPointMake(${_x}, ${_y});
|
||||
var app = $.NSWorkspace.sharedWorkspace.frontmostApplication;
|
||||
JSON.stringify({bundleId: app.bundleIdentifier.js, displayName: app.localizedName.js});
|
||||
`)
|
||||
return JSON.parse(result)
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
},
|
||||
|
||||
async listInstalled() {
|
||||
try {
|
||||
const result = await osascript(`
|
||||
tell application "System Events"
|
||||
set appList to ""
|
||||
repeat with appFile in (every file of folder "Applications" of startup disk whose name ends with ".app")
|
||||
set appPath to POSIX path of (appFile as alias)
|
||||
set appName to name of appFile
|
||||
set appList to appList & appPath & "|" & appName & "\\n"
|
||||
end repeat
|
||||
return appList
|
||||
end tell
|
||||
`)
|
||||
return result.split('\n').filter(Boolean).map(line => {
|
||||
const [path, name] = line.split('|', 2)
|
||||
const displayName = (name ?? '').replace(/\.app$/, '')
|
||||
return {
|
||||
bundleId: `com.app.${displayName.toLowerCase().replace(/\s+/g, '-')}`,
|
||||
displayName,
|
||||
path: path ?? '',
|
||||
}
|
||||
})
|
||||
} catch {
|
||||
return []
|
||||
}
|
||||
},
|
||||
|
||||
iconDataUrl(_path) {
|
||||
return null
|
||||
},
|
||||
|
||||
listRunning() {
|
||||
try {
|
||||
const raw = jxaSync(`
|
||||
var apps = Application("System Events").applicationProcesses.whose({backgroundOnly: false});
|
||||
var result = [];
|
||||
for (var i = 0; i < apps.length; i++) {
|
||||
try {
|
||||
var a = apps[i];
|
||||
result.push({bundleId: a.bundleIdentifier(), displayName: a.name()});
|
||||
} catch(e) {}
|
||||
}
|
||||
JSON.stringify(result);
|
||||
`)
|
||||
return JSON.parse(raw)
|
||||
} catch {
|
||||
return []
|
||||
}
|
||||
},
|
||||
|
||||
/**
 * Activates the application with the given bundle id via AppleScript.
 *
 * The id is embedded in an AppleScript string literal, so backslashes and
 * double quotes are escaped first; otherwise a crafted id could terminate the
 * literal and inject arbitrary script.
 *
 * @param bundleId Bundle identifier passed to `tell application id`.
 */
async open(bundleId) {
  const quotedId = bundleId.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
  await osascript(`tell application id "${quotedId}" to activate`)
},
|
||||
|
||||
/**
 * Makes the application processes with the given bundle ids visible again,
 * one AppleScript call per id, in order.
 *
 * Each id is embedded in an AppleScript string literal, so backslashes and
 * double quotes are escaped to keep a crafted id from breaking out of it.
 *
 * @param bundleIds Bundle identifiers of the processes to unhide.
 */
async unhide(bundleIds) {
  for (const bundleId of bundleIds) {
    const quotedId = bundleId.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
    await osascript(`
tell application "System Events"
set visible of application process (name of application process whose bundle identifier is "${quotedId}") to true
end tell
`)
  }
},
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ScreenshotAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Runs `screencapture` with the given flags, writing to a temporary PNG, and
 * returns the image as base64 together with its pixel dimensions.
 *
 * @param args Flags placed before the output path on the `screencapture` command line.
 * @returns Base64-encoded PNG data plus the width/height read from the PNG header.
 * @throws If the output file cannot be read, or is too short to contain a PNG header.
 */
async function captureScreenToBase64(args: string[]): Promise<{ base64: string; width: number; height: number }> {
  // pid + timestamp + random suffix: two captures started in the same
  // millisecond must not write to (and then delete) the same file.
  const uniqueTag = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2)}`
  const tmpFile = join(tmpdir(), `cu-screenshot-${uniqueTag}.png`)
  const proc = Bun.spawn(['screencapture', ...args, tmpFile], {
    stdout: 'pipe', stderr: 'pipe',
  })
  const exitCode = await proc.exited
  try {
    const buf = readFileSync(tmpFile)
    // Width and height are big-endian uint32 values at byte offsets 16 and 20,
    // so anything shorter than 24 bytes cannot be a usable PNG.
    if (buf.length < 24) {
      throw new Error(`screencapture produced an invalid PNG (exit code ${exitCode}, ${buf.length} bytes)`)
    }
    return {
      base64: buf.toString('base64'),
      width: buf.readUInt32BE(16),
      height: buf.readUInt32BE(20),
    }
  } finally {
    // Best-effort cleanup; the file may not exist if the capture failed.
    try { unlinkSync(tmpFile) } catch {}
  }
}
|
||||
|
||||
/**
 * Screenshot API backed by the `screencapture` command line tool.
 *
 * The allow-list, quality and target/output size arguments are accepted but
 * not used; only the optional display id and the region are forwarded.
 */
export const screenshot: ScreenshotAPI = {
  /** Captures a screen silently (`-x`), optionally selecting a display with `-D`. */
  async captureExcluding(_allowedBundleIds, _quality, _targetW, _targetH, displayId) {
    const displayFlags = displayId === undefined ? [] : ['-D', String(displayId)]
    return captureScreenToBase64(['-x', ...displayFlags])
  },

  /** Captures the rectangle `x,y,w,h` (`-R`), optionally selecting a display with `-D`. */
  async captureRegion(_allowedBundleIds, x, y, w, h, _outW, _outH, _quality, displayId) {
    const region = `${x},${y},${w},${h}`
    const displayFlags = displayId === undefined ? [] : ['-D', String(displayId)]
    return captureScreenToBase64(['-x', '-R', region, ...displayFlags])
  },
}
|
||||
278
packages/@ant/computer-use-swift/src/backends/linux.ts
Normal file
278
packages/@ant/computer-use-swift/src/backends/linux.ts
Normal file
@@ -0,0 +1,278 @@
|
||||
/**
|
||||
* Linux backend for computer-use-swift
|
||||
*
|
||||
* Uses xrandr for display info, scrot for screenshots,
|
||||
* wmctrl/xdotool for window management, and xdg-open for launching apps.
|
||||
*
|
||||
* Requires: xrandr, scrot, xdotool, wmctrl (optional)
|
||||
*/
|
||||
|
||||
import type {
|
||||
AppInfo, AppsAPI, DisplayAPI, DisplayGeometry, InstalledApp,
|
||||
PrepareDisplayResult, RunningApp, ScreenshotAPI, ScreenshotResult,
|
||||
SwiftBackend, WindowDisplayInfo,
|
||||
} from '../types.js'
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Shell helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Runs a command synchronously and returns its trimmed standard output.
 * The exit status and standard error are not inspected.
 *
 * @param cmd Executable followed by its arguments.
 */
function run(cmd: string[]): string {
  const { stdout } = Bun.spawnSync({ cmd, stdout: 'pipe', stderr: 'pipe' })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
|
||||
|
||||
/**
 * Runs a command asynchronously: reads all of its standard output, waits for
 * the process to exit, and returns the output trimmed. The exit status is
 * not inspected.
 *
 * @param cmd Executable followed by its arguments.
 */
async function runAsync(cmd: string[]): Promise<string> {
  const child = Bun.spawn(cmd, { stdout: 'pipe', stderr: 'pipe' })
  const captured = await new Response(child.stdout).text()
  await child.exited
  return captured.trim()
}
|
||||
|
||||
/**
 * Reports whether `which <name>` exits with status 0.
 *
 * @param name Command name to look up.
 */
function commandExists(name: string): boolean {
  const lookup = Bun.spawnSync({ cmd: ['which', name], stdout: 'pipe', stderr: 'pipe' })
  return lookup.exitCode === 0
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// DisplayAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Display information for Linux, derived from `xrandr --query`.
 */
export const display: DisplayAPI = {
  /**
   * Returns the geometry of the requested display, falling back to the first
   * listed display and finally to a 1920x1080 default.
   */
  getSize(displayId?: number): DisplayGeometry {
    const displays = this.listAll()
    const requested = displayId === undefined
      ? undefined
      : displays.find(d => d.displayId === displayId)
    return requested ?? displays[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
  },

  /**
   * Lists connected outputs that have an active mode. Display ids are the
   * zero-based order in which outputs appear in the xrandr output, and the
   * scale factor is always reported as 1.
   *
   * @returns At least one entry; a single 1920x1080 default when nothing
   *   matches or the lookup throws.
   */
  listAll(): DisplayGeometry[] {
    const fallback = (): DisplayGeometry[] => [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }]
    try {
      const xrandrOutput = run(['xrandr', '--query'])
      // Match lines like: "HDMI-1 connected 1920x1080+0+0" or "eDP-1 connected primary 2560x1440+0+0"
      const connectedOutput = /^\S+\s+connected\s+(?:primary\s+)?(\d+)x(\d+)\+\d+\+\d+/gm
      const found: DisplayGeometry[] = Array.from(
        xrandrOutput.matchAll(connectedOutput),
        (hit, position) => ({
          width: Number(hit[1]),
          height: Number(hit[2]),
          scaleFactor: 1,
          displayId: position,
        }),
      )
      return found.length > 0 ? found : fallback()
    } catch {
      return fallback()
    }
  },
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// AppsAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Application management for Linux, built on xdotool, wmctrl, /proc and
 * .desktop files.
 */
export const apps: AppsAPI = {
  /** No display preparation is performed on Linux; nothing is activated or hidden. */
  async prepareDisplay(_allowlistBundleIds, _surrogateHost, _displayId): Promise<PrepareDisplayResult> {
    return { activated: '', hidden: [] }
  },

  /** Always reports an empty hide set. */
  async previewHideSet(_bundleIds, _displayId): Promise<AppInfo[]> {
    return []
  },

  /** Maps every requested id to display 0; no per-window lookup is done. */
  async findWindowDisplays(bundleIds): Promise<WindowDisplayInfo[]> {
    return bundleIds.map(bundleId => ({ bundleId, displayIds: [0] }))
  },

  /**
   * Identifies the process owning the window at a screen point.
   *
   * xdotool can only report the window under the pointer, so the pointer is
   * moved to the point, queried, and then moved back (`mousemove restore`)
   * within the same xdotool invocation so the query does not leave the cursor
   * displaced. Coordinates are rounded because xdotool expects integers.
   *
   * @returns `bundleId` is the executable path (or the pid when the path is
   *   unavailable); `null` when no window or process can be determined.
   */
  async appUnderPoint(x, y): Promise<AppInfo | null> {
    try {
      const out = run([
        'xdotool',
        'mousemove', '--sync', String(Math.round(x)), String(Math.round(y)),
        'getmouselocation', '--shell',
        'mousemove', 'restore',
      ])
      const windowId = out.match(/WINDOW=(\d+)/)?.[1]
      if (!windowId) return null

      const pidStr = run(['xdotool', 'getwindowpid', windowId])
      if (!pidStr) return null

      let exePath = ''
      try { exePath = run(['readlink', '-f', `/proc/${pidStr}/exe`]) } catch { /* ignore */ }

      let appName = ''
      try { appName = run(['cat', `/proc/${pidStr}/comm`]) } catch { /* ignore */ }

      if (!exePath && !appName) return null
      return { bundleId: exePath || pidStr, displayName: appName || 'unknown' }
    } catch {
      return null
    }
  },

  /**
   * Lists applications described by .desktop files in the system-wide and
   * per-user application directories (top level only, at most 200 entries).
   * Entries with `NoDisplay=true` or without a `Name=` line are skipped.
   *
   * @returns `bundleId` is the .desktop file name without its suffix and
   *   `path` is the first token of the `Exec=` line; `[]` on failure.
   */
  async listInstalled(): Promise<InstalledApp[]> {
    try {
      const dirs = ['/usr/share/applications', '/usr/local/share/applications']
      // Only add the per-user directory when HOME is set; interpolating an
      // unset HOME would produce the bogus path "undefined/.local/...".
      if (process.env.HOME) dirs.push(`${process.env.HOME}/.local/share/applications`)

      const installed: InstalledApp[] = []
      for (const dir of dirs) {
        let files: string
        try {
          files = run(['find', dir, '-maxdepth', '1', '-name', '*.desktop'])
        } catch { continue }

        for (const filepath of files.split('\n').filter(Boolean)) {
          try {
            const content = run(['cat', filepath])
            if (/^NoDisplay=true$/m.test(content)) continue

            const name = content.match(/^Name=(.+)$/m)?.[1] ?? ''
            if (!name) continue
            const exec = content.match(/^Exec=(.+)$/m)?.[1] ?? ''

            installed.push({
              // Strip only the trailing suffix; ".desktop" may also occur inside the name.
              bundleId: (filepath.split('/').pop() ?? '').replace(/\.desktop$/, ''),
              displayName: name,
              path: exec.split(/\s+/)[0] ?? '',
            })
          } catch { /* skip unreadable files */ }
        }
      }
      return installed.slice(0, 200)
    } catch {
      return []
    }
  },

  /** Icons are not provided by this backend; always returns `null`. */
  iconDataUrl(_path): string | null {
    return null
  },

  /**
   * Lists running applications. With wmctrl available, one entry per distinct
   * process owning a managed window (at most 50); otherwise the first 50
   * lines of `ps -eo pid,comm`, with the pid used as `bundleId`.
   */
  listRunning(): RunningApp[] {
    try {
      if (commandExists('wmctrl')) {
        const windowList = run(['wmctrl', '-l', '-p'])
        const seen = new Set<string>()
        const running: RunningApp[] = []
        for (const line of windowList.split('\n').filter(Boolean)) {
          // wmctrl format: "0x04000003  0 12345  hostname Window Title"
          const pid = line.split(/\s+/)[2]
          if (!pid || pid === '0') continue

          let exePath = ''
          try { exePath = run(['readlink', '-f', `/proc/${pid}/exe`]) } catch { /* ignore */ }
          let appName = ''
          try { appName = run(['cat', `/proc/${pid}/comm`]) } catch { /* ignore */ }
          if (!appName) continue

          // Deduplicate by bundleId, keeping the first occurrence.
          const bundleId = exePath || pid
          if (seen.has(bundleId)) continue
          seen.add(bundleId)
          running.push({ bundleId, displayName: appName })
        }
        return running.slice(0, 50)
      }

      // Fallback: plain process list.
      const processList = run(['ps', '-eo', 'pid,comm', '--no-headers'])
      const running: RunningApp[] = []
      for (const line of processList.split('\n').filter(Boolean).slice(0, 50)) {
        const match = line.trim().match(/^(\d+)\s+(.+)$/)
        if (match?.[1] && match[2]) {
          running.push({ bundleId: match[1], displayName: match[2] })
        }
      }
      return running
    } catch {
      return []
    }
  },

  /**
   * Launches an application. `gtk-launch` is tried first with the name as a
   * .desktop id; if it is missing, cannot be started, or exits with a
   * non-zero status, the name is handed to `xdg-open` instead.
   */
  async open(name): Promise<void> {
    try {
      const desktopName = name.endsWith('.desktop') ? name : `${name}.desktop`
      if (commandExists('gtk-launch')) {
        // Check the exit status explicitly: a failed launch must reach the
        // xdg-open fallback below instead of being reported as success.
        const launcher = Bun.spawn(['gtk-launch', desktopName], { stdout: 'ignore', stderr: 'ignore' })
        if ((await launcher.exited) === 0) return
      }
    } catch { /* fall through */ }

    await runAsync(['xdg-open', name])
  },

  /**
   * Raises the windows identified by `bundleIds`. Ids starting with "0x" are
   * treated as window ids and raised with wmctrl when it is available; all
   * other ids are used as an xdotool window-name search. Failures for
   * individual ids are ignored.
   */
  async unhide(bundleIds): Promise<void> {
    for (const id of bundleIds) {
      try {
        // Test the cheap prefix first so `which` is only spawned for window ids.
        if (id.startsWith('0x') && commandExists('wmctrl')) {
          await runAsync(['wmctrl', '-i', '-R', id])
        } else {
          await runAsync(['xdotool', 'search', '--name', id, 'windowactivate'])
        }
      } catch { /* ignore failures for individual windows */ }
    }
  },
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ScreenshotAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Fixed output path that every scrot capture overwrites (`-o`). */
const SCREENSHOT_PATH = '/tmp/cu-screenshot.png'

/**
 * Runs scrot with the given extra flags and returns the bytes of the PNG it wrote.
 *
 * @throws If scrot exits with a non-zero status. Without this check a failed
 *   capture would silently return the stale file left by an earlier capture.
 */
async function runScrot(extraArgs: string[]): Promise<Buffer> {
  const proc = Bun.spawn(['scrot', ...extraArgs, '-o', SCREENSHOT_PATH], {
    stdout: 'ignore',
    stderr: 'ignore',
  })
  const exitCode = await proc.exited
  if (exitCode !== 0) throw new Error(`scrot exited with code ${exitCode}`)
  return Buffer.from(await Bun.file(SCREENSHOT_PATH).arrayBuffer())
}

/**
 * Reads the pixel size from a PNG's IHDR chunk (big-endian uint32 width and
 * height at byte offsets 16 and 20). Returns `null` if the data is too short
 * or the first chunk is not IHDR.
 */
function pngSize(png: Buffer): { width: number; height: number } | null {
  if (png.length < 24 || png.toString('latin1', 12, 16) !== 'IHDR') return null
  return { width: png.readUInt32BE(16), height: png.readUInt32BE(20) }
}

/**
 * Screenshot API backed by scrot. The allow-list, quality and target/output
 * size arguments are accepted but not used.
 */
export const screenshot: ScreenshotAPI = {
  /**
   * Captures the screen. The reported size is that of the image actually
   * produced; the display table is consulted only when the PNG header cannot
   * be read.
   *
   * @returns An empty result (`base64: ''`, 0x0) if the capture fails.
   */
  async captureExcluding(_allowedBundleIds, _quality, _targetW, _targetH, _displayId): Promise<ScreenshotResult> {
    try {
      const png = await runScrot([])
      const size = pngSize(png) ?? display.getSize(_displayId)
      return { base64: png.toString('base64'), width: size.width, height: size.height }
    } catch {
      return { base64: '', width: 0, height: 0 }
    }
  },

  /**
   * Captures the rectangle `x,y,w,h` (scrot `-a`). The reported size is read
   * from the image, falling back to the requested `w`/`h` when the PNG header
   * cannot be read.
   *
   * @returns An empty result (`base64: ''`, 0x0) if the capture fails.
   */
  async captureRegion(_allowedBundleIds, x, y, w, h, _outW, _outH, _quality, _displayId): Promise<ScreenshotResult> {
    try {
      const png = await runScrot(['-a', `${x},${y},${w},${h}`])
      const size = pngSize(png) ?? { width: w, height: h }
      return { base64: png.toString('base64'), width: size.width, height: size.height }
    } catch {
      return { base64: '', width: 0, height: 0 }
    }
  },
}
|
||||
263
packages/@ant/computer-use-swift/src/backends/win32.ts
Normal file
263
packages/@ant/computer-use-swift/src/backends/win32.ts
Normal file
@@ -0,0 +1,263 @@
|
||||
/**
|
||||
* Windows backend for computer-use-swift
|
||||
*
|
||||
* Uses PowerShell with .NET System.Drawing / System.Windows.Forms for
|
||||
* screenshots and Win32 P/Invoke for window/process management.
|
||||
*/
|
||||
|
||||
import type {
|
||||
AppInfo, AppsAPI, DisplayAPI, DisplayGeometry, InstalledApp,
|
||||
PrepareDisplayResult, RunningApp, ScreenshotAPI, ScreenshotResult,
|
||||
SwiftBackend, WindowDisplayInfo,
|
||||
} from '../types.js'
|
||||
|
||||
import { listWindows } from 'src/utils/computerUse/win32/windowEnum.js'
|
||||
import { captureWindow, captureWindowByHwnd } from 'src/utils/computerUse/win32/windowCapture.js'
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// PowerShell helper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Runs a PowerShell script synchronously (`-NoProfile -NonInteractive`) and
 * returns its trimmed standard output. The exit status and standard error
 * are not inspected.
 *
 * @param script Script text passed to `-Command`.
 */
function ps(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({ cmd: argv, stdout: 'pipe', stderr: 'pipe' })
  return new TextDecoder().decode(stdout).trim()
}
|
||||
|
||||
/**
 * Runs a PowerShell script asynchronously (`-NoProfile -NonInteractive`):
 * reads all standard output, waits for the process to exit, and returns the
 * output trimmed. The exit status is not inspected.
 *
 * @param script Script text passed to `-Command`.
 */
async function psAsync(script: string): Promise<string> {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const child = Bun.spawn(argv, { stdout: 'pipe', stderr: 'pipe' })
  const captured = await new Response(child.stdout).text()
  await child.exited
  return captured.trim()
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// DisplayAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Display information for Windows, read from
 * `System.Windows.Forms.Screen.AllScreens` through PowerShell.
 */
export const display: DisplayAPI = {
  /**
   * Returns the geometry of the requested display, falling back to the first
   * listed display and finally to a 1920x1080 default.
   */
  getSize(displayId?: number): DisplayGeometry {
    const all = this.listAll()
    if (displayId !== undefined) {
      const found = all.find(d => d.displayId === displayId)
      if (found) return found
    }
    return all[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
  },

  /**
   * Lists all screens. Display ids are zero-based indices into `AllScreens`.
   *
   * @returns At least one entry: a single 1920x1080 default is returned when
   *   PowerShell yields no usable output or the call throws, so callers never
   *   see an empty list or NaN dimensions.
   */
  listAll(): DisplayGeometry[] {
    const fallback: DisplayGeometry[] = [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }]
    try {
      // Output format: "w,h,index,isPrimary" entries joined by "|".
      const raw = ps(`
Add-Type -AssemblyName System.Windows.Forms
$result = @()
$idx = 0
foreach ($s in [System.Windows.Forms.Screen]::AllScreens) {
$result += "$($s.Bounds.Width),$($s.Bounds.Height),$idx,$($s.Primary)"
$idx++
}
$result -join "|"
`)
      const displays: DisplayGeometry[] = []
      for (const entry of raw.split('|')) {
        // The trailing "isPrimary" field is emitted by the script but not needed here.
        const [w, h, id] = entry.split(',')
        const geometry = {
          width: Number(w),
          height: Number(h),
          scaleFactor: 1, // Windows DPI scaling handled at system level
          displayId: Number(id),
        }
        // Drop entries that did not parse (empty output, stray text on stdout).
        const usable = geometry.width > 0 && geometry.height > 0 && Number.isInteger(geometry.displayId)
        if (usable) displays.push(geometry)
      }
      return displays.length > 0 ? displays : fallback
    } catch {
      return fallback
    }
  },
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// AppsAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Application management for Windows, built on PowerShell, Win32 P/Invoke
 * and the project's window enumeration helper.
 */
export const apps: AppsAPI = {
  /** No display preparation is performed on Windows; nothing is activated or hidden. */
  async prepareDisplay(_allowlistBundleIds, _surrogateHost, _displayId) {
    return { activated: '', hidden: [] }
  },

  /** Always reports an empty hide set. */
  async previewHideSet(_bundleIds, _displayId) {
    return []
  },

  /** Maps every requested id to display 0; no per-window lookup is done. */
  async findWindowDisplays(bundleIds) {
    return bundleIds.map(bundleId => ({ bundleId, displayIds: [0] }))
  },

  /**
   * Identifies the process owning the window at a screen point using
   * `WindowFromPoint` + `GetWindowThreadProcessId`.
   *
   * @returns `bundleId` is the executable path (or the process name when the
   *   path cannot be read); `null` when no process is found.
   */
  async appUnderPoint(_x, _y) {
    try {
      // The here-string terminator ('@) must stay at the start of its line.
      const out = ps(`
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WinPt {
  [StructLayout(LayoutKind.Sequential)] public struct POINT { public int X; public int Y; }
  [DllImport("user32.dll")] public static extern IntPtr WindowFromPoint(POINT p);
  [DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint pid);
}
'@
$pt = New-Object WinPt+POINT
$pt.X = ${Math.round(_x)}; $pt.Y = ${Math.round(_y)}
$hwnd = [WinPt]::WindowFromPoint($pt)
# $PID is a read-only automatic variable (the PowerShell host's own process id),
# so the out-parameter must use a different variable name.
$ownerPid = [uint32]0
[WinPt]::GetWindowThreadProcessId($hwnd, [ref]$ownerPid) | Out-Null
$proc = Get-Process -Id $ownerPid -ErrorAction SilentlyContinue
if ($proc) { "$($proc.MainModule.FileName)|$($proc.ProcessName)" }
`)
      if (!out.includes('|')) return null
      const [exePath = '', name = ''] = out.split('|', 2)
      // A bare "|" means neither field could be resolved.
      if (!exePath && !name) return null
      return { bundleId: exePath || name, displayName: name || exePath }
    } catch {
      return null
    }
  },

  /**
   * Lists installed applications from the three Uninstall registry keys
   * (HKLM, HKLM WOW6432Node, HKCU), at most 200 unique entries.
   *
   * @returns `bundleId` is the registry subkey name (falling back to the
   *   display name), `path` the InstallLocation value; `[]` on failure.
   */
  async listInstalled() {
    try {
      const raw = await psAsync(`
$apps = @()
$paths = @(
  'HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
  'HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
  'HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*'
)
foreach ($p in $paths) {
  Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object { $_.DisplayName } | ForEach-Object {
    $apps += "$($_.DisplayName)|$($_.InstallLocation)|$($_.PSChildName)"
  }
}
$apps | Select-Object -Unique | Select-Object -First 200
`)
      // PowerShell terminates lines with CRLF; splitting on "\n" alone would
      // leave a trailing "\r" on the last field of every line but the last.
      return raw.split(/\r?\n/).filter(Boolean).map(line => {
        const [name = '', path = '', id = ''] = line.split('|', 3)
        return {
          bundleId: id || name,
          displayName: name,
          path,
        }
      })
    } catch {
      return []
    }
  },

  /** Icons are not provided by this backend; always returns `null`. */
  iconDataUrl(_path) {
    return null
  },

  /**
   * Lists top-level windows via the window enumeration helper. Each entry's
   * `bundleId` is the stringified window handle and `displayName` its title.
   */
  listRunning() {
    try {
      return listWindows().map(w => ({
        bundleId: String(w.hwnd),
        displayName: w.title,
      }))
    } catch {
      return []
    }
  },

  /**
   * Starts a program with `Start-Process`.
   *
   * @param name Executable path or a name resolvable by `Start-Process`.
   *   Single quotes are doubled so the value stays inside its PowerShell
   *   single-quoted literal.
   */
  async open(name) {
    const escaped = name.replace(/'/g, "''")
    await psAsync(`
if (Test-Path '${escaped}') {
  Start-Process '${escaped}'
} else {
  Start-Process -FilePath '${escaped}' -ErrorAction SilentlyContinue
}`)
  },

  /**
   * Restores and foregrounds one window per id (`ShowWindow(SW_RESTORE)` +
   * `SetForegroundWindow`).
   *
   * Ids are resolved in this order:
   *  1. as a process name (the original behavior);
   *  2. as an executable path, using its file name without ".exe" — the form
   *     `appUnderPoint` produces;
   *  3. as a decimal window handle — the form `listRunning` appears to
   *     produce (assumes `hwnd` stringifies to decimal digits; confirm
   *     against windowEnum).
   *
   * Ids are embedded only in single-quoted PowerShell literals with quotes
   * doubled; the previous double-quoted form allowed `$(...)` expansion.
   */
  async unhide(bundleIds) {
    for (const name of bundleIds) {
      if (!name) continue

      const leaf = name.split(/[\\/]/).pop() ?? name
      const candidates = [...new Set([name, leaf.replace(/\.exe$/i, '')])].filter(Boolean)
      const nameList = candidates.map(n => `'${n.replace(/'/g, "''")}'`).join(',')
      // Only short all-digit ids are ever treated as handles; "0" disables the branch.
      const hwndLiteral = /^\d{1,15}$/.test(name) ? name : '0'

      await psAsync(`
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WinShow {
  [DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr hWnd, int nCmd);
  [DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr hWnd);
  [DllImport("user32.dll")] public static extern bool IsWindow(IntPtr hWnd);
}
'@
$target = [IntPtr]::Zero
$proc = Get-Process -Name ${nameList} -ErrorAction SilentlyContinue | Where-Object { $_.MainWindowHandle -ne [IntPtr]::Zero } | Select-Object -First 1
if ($proc) { $target = $proc.MainWindowHandle } elseif (${hwndLiteral} -ne 0) {
  $candidate = [IntPtr][long]${hwndLiteral}
  if ([WinShow]::IsWindow($candidate)) { $target = $candidate }
}
if ($target -ne [IntPtr]::Zero) { [WinShow]::ShowWindow($target, 9) | Out-Null; [WinShow]::SetForegroundWindow($target) | Out-Null }
`)
    }
  },
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ScreenshotAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Parses PowerShell capture output of the form "<width>,<height>,<base64>".
 * Returns an empty 0x0 result when the output is missing or malformed, so a
 * failed capture is never reported with a plausible size and no data.
 */
function parseCaptureOutput(raw: string): ScreenshotResult {
  const empty: ScreenshotResult = { base64: '', width: 0, height: 0 }
  const firstComma = raw.indexOf(',')
  if (firstComma < 0) return empty
  const secondComma = raw.indexOf(',', firstComma + 1)
  if (secondComma < 0) return empty

  const width = Number(raw.slice(0, firstComma))
  const height = Number(raw.slice(firstComma + 1, secondComma))
  const base64 = raw.slice(secondComma + 1)
  if (!base64 || !Number.isFinite(width) || !Number.isFinite(height)) return empty
  return { base64, width, height }
}

/**
 * Screenshot API for Windows, using `Graphics.CopyFromScreen` through
 * PowerShell. The allow-list, quality and target/output size arguments are
 * accepted but not used.
 */
export const screenshot: ScreenshotAPI = {
  /**
   * Captures one screen as PNG. `displayId` is an index into
   * `Screen.AllScreens`; when it is absent or out of range the primary
   * screen is captured instead of indexing past the end of the array.
   *
   * @returns An empty result (`base64: ''`, 0x0) if the capture produced no usable output.
   */
  async captureExcluding(_allowedBundleIds, _quality, _targetW, _targetH, displayId) {
    const screenIndex = displayId !== undefined && Number.isInteger(displayId) ? displayId : -1
    const raw = await psAsync(`
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
$screens = [System.Windows.Forms.Screen]::AllScreens
$idx = ${screenIndex}
$screen = if ($idx -ge 0 -and $idx -lt $screens.Length) { $screens[$idx] } else { [System.Windows.Forms.Screen]::PrimaryScreen }
$bounds = $screen.Bounds
$bmp = New-Object System.Drawing.Bitmap($bounds.Width, $bounds.Height)
$g = [System.Drawing.Graphics]::FromImage($bmp)
$g.CopyFromScreen($bounds.Location, [System.Drawing.Point]::Empty, $bounds.Size)
$g.Dispose()
$ms = New-Object System.IO.MemoryStream
$bmp.Save($ms, [System.Drawing.Imaging.ImageFormat]::Png)
$bmp.Dispose()
$bytes = $ms.ToArray()
$ms.Dispose()
"$($bounds.Width),$($bounds.Height)," + [Convert]::ToBase64String($bytes)
`)
    return parseCaptureOutput(raw)
  },

  /**
   * Captures the rectangle `x,y,w,h` (screen coordinates) as PNG.
   *
   * @returns The image with size `w` x `h`, or an empty result
   *   (`base64: ''`, 0x0) if the capture produced no usable output.
   */
  async captureRegion(_allowedBundleIds, x, y, w, h, _outW, _outH, _quality, _displayId) {
    const raw = await psAsync(`
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
$bmp = New-Object System.Drawing.Bitmap(${w}, ${h})
$g = [System.Drawing.Graphics]::FromImage($bmp)
$g.CopyFromScreen(${x}, ${y}, 0, 0, (New-Object System.Drawing.Size(${w}, ${h})))
$g.Dispose()
$ms = New-Object System.IO.MemoryStream
$bmp.Save($ms, [System.Drawing.Imaging.ImageFormat]::Png)
$bmp.Dispose()
$bytes = $ms.ToArray()
$ms.Dispose()
"${w},${h}," + [Convert]::ToBase64String($bytes)
`)
    const shot = parseCaptureOutput(raw)
    // Keep reporting the requested size for successful captures, as before.
    return shot.base64 ? { base64: shot.base64, width: w, height: h } : shot
  },

  /**
   * Capture a specific window by title or HWND using PrintWindow.
   * Works even for occluded or background windows.
   *
   * @param titleOrHwnd A number is treated as a window handle, a string as a title.
   */
  captureWindowTarget(titleOrHwnd: string | number): ScreenshotResult | null {
    return typeof titleOrHwnd === 'number'
      ? captureWindowByHwnd(titleOrHwnd)
      : captureWindow(titleOrHwnd)
  },
}
|
||||
@@ -1,377 +1,84 @@
|
||||
/**
|
||||
* @ant/computer-use-swift — macOS 实现
|
||||
* @ant/computer-use-swift — cross-platform display, apps, and screenshot API
|
||||
*
|
||||
* 用 AppleScript/JXA/screencapture 替代原始 Swift 原生模块。
|
||||
* 提供显示器信息、应用管理、截图等功能。
|
||||
* Platform backends:
|
||||
* - darwin: AppleScript/JXA + screencapture
|
||||
* - win32: PowerShell + System.Drawing + Win32 P/Invoke
|
||||
*
|
||||
* 仅 macOS 支持。
|
||||
* Add new platforms by creating backends/<platform>.ts implementing SwiftBackend.
|
||||
*/
|
||||
|
||||
import { readFileSync, unlinkSync } from 'fs'
|
||||
import { tmpdir } from 'os'
|
||||
import { join } from 'path'
|
||||
// Re-export all types
|
||||
export type {
|
||||
DisplayGeometry,
|
||||
PrepareDisplayResult,
|
||||
AppInfo,
|
||||
InstalledApp,
|
||||
RunningApp,
|
||||
ScreenshotResult,
|
||||
ResolvePrepareCaptureResult,
|
||||
WindowDisplayInfo,
|
||||
DisplayAPI,
|
||||
AppsAPI,
|
||||
ScreenshotAPI,
|
||||
SwiftBackend,
|
||||
} from './types.js'
|
||||
|
||||
import type { ResolvePrepareCaptureResult, SwiftBackend } from './types.js'
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Types (exported for callers)
|
||||
// Platform dispatch
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Size and identity of one display, as returned by the display API. */
export interface DisplayGeometry {
  /** Display width as reported by the backend. */
  width: number
  /** Display height as reported by the backend. */
  height: number
  /** Scale factor reported for the display. */
  scaleFactor: number
  /** Backend-specific identifier of the display. */
  displayId: number
}
|
||||
|
||||
/** Result of `prepareDisplay`. */
export interface PrepareDisplayResult {
  /** Presumably the id of the application that was activated; empty when none — confirm with callers. */
  activated: string
  /** Presumably the ids of the applications that were hidden — confirm with callers. */
  hidden: string[]
}
|
||||
|
||||
/** Minimal description of an application. */
export interface AppInfo {
  /** Identifier of the application. */
  bundleId: string
  /** Human-readable application name. */
  displayName: string
}
|
||||
|
||||
/** An application found on disk by `listInstalled`. */
export interface InstalledApp {
  /** Identifier of the application. */
  bundleId: string
  /** Human-readable application name. */
  displayName: string
  /** Location of the application as reported by the backend. */
  path: string
  /** Optional icon; the name suggests a data URL — confirm with producers. */
  iconDataUrl?: string
}
|
||||
|
||||
/** An application reported by `listRunning`. */
export interface RunningApp {
  /** Identifier of the application. */
  bundleId: string
  /** Human-readable application name. */
  displayName: string
}
|
||||
|
||||
/** A captured image together with its dimensions. */
export interface ScreenshotResult {
  /** Image data encoded as base64. */
  base64: string
  /** Image width. */
  width: number
  /** Image height. */
  height: number
}
|
||||
|
||||
/** Capture result with the same shape as `ScreenshotResult`: base64 data plus dimensions. */
export interface ResolvePrepareCaptureResult {
  /** Image data encoded as base64. */
  base64: string
  /** Image width. */
  width: number
  /** Image height. */
  height: number
}
|
||||
|
||||
/** Associates an application with the displays returned for it by `findWindowDisplays`. */
export interface WindowDisplayInfo {
  /** Identifier of the application. */
  bundleId: string
  /** Ids of the displays associated with the application. */
  displayIds: number[]
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Runs a JXA (JavaScript for Automation) script synchronously through
 * `osascript -l JavaScript` and returns its trimmed standard output.
 * The exit status and standard error are not inspected.
 */
function jxaSync(script: string): string {
  const argv = ['osascript', '-l', 'JavaScript', '-e', script]
  const { stdout } = Bun.spawnSync({ cmd: argv, stdout: 'pipe', stderr: 'pipe' })
  return new TextDecoder().decode(stdout).trim()
}
|
||||
|
||||
/**
 * Runs an AppleScript synchronously through `osascript -e` and returns its
 * trimmed standard output. The exit status and standard error are not
 * inspected.
 */
function osascriptSync(script: string): string {
  const argv = ['osascript', '-e', script]
  const { stdout } = Bun.spawnSync({ cmd: argv, stdout: 'pipe', stderr: 'pipe' })
  return new TextDecoder().decode(stdout).trim()
}
|
||||
|
||||
/**
 * Runs an AppleScript asynchronously through `osascript -e`: reads all
 * standard output, waits for the process to exit, and returns the output
 * trimmed. The exit status is not inspected.
 */
async function osascript(script: string): Promise<string> {
  const argv = ['osascript', '-e', script]
  const child = Bun.spawn(argv, { stdout: 'pipe', stderr: 'pipe' })
  const captured = await new Response(child.stdout).text()
  await child.exited
  return captured.trim()
}
|
||||
|
||||
/**
 * Runs a JXA script asynchronously through `osascript -l JavaScript`: reads
 * all standard output, waits for the process to exit, and returns the output
 * trimmed. The exit status is not inspected.
 */
async function jxa(script: string): Promise<string> {
  const argv = ['osascript', '-l', 'JavaScript', '-e', script]
  const child = Bun.spawn(argv, { stdout: 'pipe', stderr: 'pipe' })
  const captured = await new Response(child.stdout).text()
  await child.exited
  return captured.trim()
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// DisplayAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Display queries offered by a backend. */
interface DisplayAPI {
  /** Returns the geometry for `displayId`, or a backend-chosen default display when it is omitted. */
  getSize(displayId?: number): DisplayGeometry
  /** Returns the geometry of every display the backend can see. */
  listAll(): DisplayGeometry[]
}
|
||||
|
||||
/**
 * macOS display information obtained through JXA.
 */
const displayAPI: DisplayAPI = {
  /**
   * Returns the geometry of the requested display, falling back to the first
   * listed display and finally to a 1920x1080 @2x default with id 1.
   */
  getSize(displayId?: number): DisplayGeometry {
    const displays = this.listAll()
    const requested = displayId === undefined
      ? undefined
      : displays.find(d => d.displayId === displayId)
    return requested ?? displays[0] ?? { width: 1920, height: 1080, scaleFactor: 2, displayId: 1 }
  },

  /**
   * Lists displays. A CoreGraphics-based script is tried first; if running it
   * or parsing its output throws, an NSScreen-based script is tried; if that
   * also throws, a single 1920x1080 @2x default is returned.
   */
  listAll(): DisplayGeometry[] {
    const coreGraphicsScript = `
ObjC.import("CoreGraphics");
var displays = $.CGDisplayCopyAllDisplayModes ? [] : [];
var active = $.CGGetActiveDisplayList(10, null, Ref());
var countRef = Ref();
$.CGGetActiveDisplayList(0, null, countRef);
var count = countRef[0];
var idBuf = Ref();
$.CGGetActiveDisplayList(count, idBuf, countRef);
var result = [];
for (var i = 0; i < count; i++) {
var did = idBuf[i];
var w = $.CGDisplayPixelsWide(did);
var h = $.CGDisplayPixelsHigh(did);
var mode = $.CGDisplayCopyDisplayMode(did);
var pw = $.CGDisplayModeGetPixelWidth(mode);
var sf = pw > 0 && w > 0 ? pw / w : 2;
result.push({width: w, height: h, scaleFactor: sf, displayId: did});
}
JSON.stringify(result);
`
    // Fallback: use NSScreen via JXA
    const nsScreenScript = `
ObjC.import("AppKit");
var screens = $.NSScreen.screens;
var result = [];
for (var i = 0; i < screens.count; i++) {
var s = screens.objectAtIndex(i);
var frame = s.frame;
var desc = s.deviceDescription;
var screenNumber = desc.objectForKey($("NSScreenNumber")).intValue;
var backingFactor = s.backingScaleFactor;
result.push({
width: Math.round(frame.size.width),
height: Math.round(frame.size.height),
scaleFactor: backingFactor,
displayId: screenNumber
});
}
JSON.stringify(result);
`
    for (const script of [coreGraphicsScript, nsScreenScript]) {
      try {
        const reported = JSON.parse(jxaSync(script)) as DisplayGeometry[]
        // Coerce every field to a number; the script output is untyped JSON.
        return reported.map(d => ({
          width: Number(d.width),
          height: Number(d.height),
          scaleFactor: Number(d.scaleFactor),
          displayId: Number(d.displayId),
        }))
      } catch {
        // Try the next strategy.
      }
    }
    return [{ width: 1920, height: 1080, scaleFactor: 2, displayId: 1 }]
  },
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// AppsAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Application queries and actions offered by a backend. */
interface AppsAPI {
  /** Prepares a display for capture and reports what was activated and hidden. */
  prepareDisplay(
    allowlistBundleIds: string[],
    surrogateHost: string,
    displayId?: number,
  ): Promise<PrepareDisplayResult>
  /** Reports the applications for the given ids without changing anything (per the name; confirm with callers). */
  previewHideSet(bundleIds: string[], displayId?: number): Promise<AppInfo[]>
  /** Reports the display ids associated with each given application id. */
  findWindowDisplays(bundleIds: string[]): Promise<WindowDisplayInfo[]>
  /** Reports the application at a screen point, or `null` when none is found. */
  appUnderPoint(x: number, y: number): Promise<AppInfo | null>
  /** Lists installed applications. */
  listInstalled(): Promise<InstalledApp[]>
  /** Returns an icon for the application at `path`, or `null` when unavailable. */
  iconDataUrl(path: string): string | null
  /** Lists running applications (synchronous). */
  listRunning(): RunningApp[]
  /** Opens or activates the application with the given id. */
  open(bundleId: string): Promise<void>
  /** Makes the applications with the given ids visible again. */
  unhide(bundleIds: string[]): Promise<void>
}
|
||||
|
||||
const appsAPI: AppsAPI = {
|
||||
async prepareDisplay(
|
||||
_allowlistBundleIds: string[],
|
||||
_surrogateHost: string,
|
||||
_displayId?: number,
|
||||
): Promise<PrepareDisplayResult> {
|
||||
return { activated: '', hidden: [] }
|
||||
},
|
||||
|
||||
async previewHideSet(
|
||||
_bundleIds: string[],
|
||||
_displayId?: number,
|
||||
): Promise<AppInfo[]> {
|
||||
return []
|
||||
},
|
||||
|
||||
async findWindowDisplays(bundleIds: string[]): Promise<WindowDisplayInfo[]> {
|
||||
// Each running app is assumed to be on display 1
|
||||
return bundleIds.map(bundleId => ({ bundleId, displayIds: [1] }))
|
||||
},
|
||||
|
||||
async appUnderPoint(_x: number, _y: number): Promise<AppInfo | null> {
|
||||
// Use JXA to find app at mouse position via accessibility
|
||||
try {
|
||||
const result = await jxa(`
|
||||
ObjC.import("CoreGraphics");
|
||||
ObjC.import("AppKit");
|
||||
var pt = $.CGPointMake(${_x}, ${_y});
|
||||
// Get frontmost app as a fallback
|
||||
var app = $.NSWorkspace.sharedWorkspace.frontmostApplication;
|
||||
JSON.stringify({bundleId: app.bundleIdentifier.js, displayName: app.localizedName.js});
|
||||
`)
|
||||
return JSON.parse(result)
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
},
|
||||
|
||||
async listInstalled(): Promise<InstalledApp[]> {
|
||||
try {
|
||||
const result = await osascript(`
|
||||
tell application "System Events"
|
||||
set appList to ""
|
||||
repeat with appFile in (every file of folder "Applications" of startup disk whose name ends with ".app")
|
||||
set appPath to POSIX path of (appFile as alias)
|
||||
set appName to name of appFile
|
||||
set appList to appList & appPath & "|" & appName & "\\n"
|
||||
end repeat
|
||||
return appList
|
||||
end tell
|
||||
`)
|
||||
return result.split('\n').filter(Boolean).map(line => {
|
||||
const [path, name] = line.split('|', 2)
|
||||
// Derive bundleId from Info.plist would be ideal, but use path-based fallback
|
||||
const displayName = (name ?? '').replace(/\.app$/, '')
|
||||
return {
|
||||
bundleId: `com.app.${displayName.toLowerCase().replace(/\s+/g, '-')}`,
|
||||
displayName,
|
||||
path: path ?? '',
|
||||
}
|
||||
})
|
||||
} catch {
|
||||
return []
|
||||
}
|
||||
},
|
||||
|
||||
iconDataUrl(_path: string): string | null {
|
||||
return null
|
||||
},
|
||||
|
||||
listRunning(): RunningApp[] {
|
||||
try {
|
||||
const raw = jxaSync(`
|
||||
var apps = Application("System Events").applicationProcesses.whose({backgroundOnly: false});
|
||||
var result = [];
|
||||
for (var i = 0; i < apps.length; i++) {
|
||||
try {
|
||||
var a = apps[i];
|
||||
result.push({bundleId: a.bundleIdentifier(), displayName: a.name()});
|
||||
} catch(e) {}
|
||||
}
|
||||
JSON.stringify(result);
|
||||
`)
|
||||
return JSON.parse(raw)
|
||||
} catch {
|
||||
return []
|
||||
}
|
||||
},
|
||||
|
||||
/**
 * Activate the application with the given bundle identifier via AppleScript.
 *
 * The identifier is embedded in an AppleScript string literal, so backslashes
 * and double quotes are escaped first; otherwise a stray `"` would terminate
 * the literal and the remainder would be parsed as script.
 *
 * @param bundleId - Bundle identifier passed to `tell application id`.
 * @throws Whatever `osascript` rejects with (for example an unknown application).
 */
async open(bundleId: string): Promise<void> {
  const escapedId = bundleId.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
  await osascript(`tell application id "${escapedId}" to activate`)
},
|
||||
|
||||
/**
 * Make the application processes with the given bundle identifiers visible
 * again, one AppleScript call per identifier, in order.
 *
 * Each identifier is embedded in an AppleScript string literal, so backslashes
 * and double quotes are escaped first to keep the literal intact.
 *
 * @param bundleIds - Bundle identifiers to unhide.
 * @throws Whatever `osascript` rejects with; processing stops at the first failure.
 */
async unhide(bundleIds: string[]): Promise<void> {
  for (const bundleId of bundleIds) {
    const escapedId = bundleId.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
    await osascript(`
      tell application "System Events"
        set visible of application process (name of application process whose bundle identifier is "${escapedId}") to true
      end tell
    `)
  }
},
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ScreenshotAPI
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/** Screenshot operations: a display-level capture and a rectangular region capture. */
interface ScreenshotAPI {
  /** Capture a display (optionally selected by `displayId`). */
  captureExcluding(
    allowedBundleIds: string[],
    quality: number,
    targetW: number,
    targetH: number,
    displayId?: number,
  ): Promise<ScreenshotResult>

  /** Capture the rectangle `x, y, w, h` (optionally on the display `displayId`). */
  captureRegion(
    allowedBundleIds: string[],
    x: number,
    y: number,
    w: number,
    h: number,
    outW: number,
    outH: number,
    quality: number,
    displayId?: number,
  ): Promise<ScreenshotResult>
}
|
||||
|
||||
async function captureScreenToBase64(args: string[]): Promise<{ base64: string; width: number; height: number }> {
|
||||
const tmpFile = join(tmpdir(), `cu-screenshot-${Date.now()}.png`)
|
||||
const proc = Bun.spawn(['screencapture', ...args, tmpFile], {
|
||||
stdout: 'pipe', stderr: 'pipe',
|
||||
})
|
||||
await proc.exited
|
||||
|
||||
function loadBackend(): SwiftBackend | null {
|
||||
try {
|
||||
const buf = readFileSync(tmpFile)
|
||||
const base64 = buf.toString('base64')
|
||||
// Parse PNG header for dimensions (bytes 16-23)
|
||||
const width = buf.readUInt32BE(16)
|
||||
const height = buf.readUInt32BE(20)
|
||||
return { base64, width, height }
|
||||
} finally {
|
||||
try { unlinkSync(tmpFile) } catch {}
|
||||
switch (process.platform) {
|
||||
case 'darwin':
|
||||
return require('./backends/darwin.js') as SwiftBackend
|
||||
case 'win32':
|
||||
return require('./backends/win32.js') as SwiftBackend
|
||||
case 'linux':
|
||||
return require('./backends/linux.js') as SwiftBackend
|
||||
default:
|
||||
return null
|
||||
}
|
||||
} catch {
|
||||
return null
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * `screencapture`-backed implementation of {@link ScreenshotAPI}.
 *
 * Both methods only assemble command-line arguments and delegate to
 * `captureScreenToBase64`. The bundle-id allow-list, quality and output-size
 * parameters are accepted for interface compatibility but are not used.
 */
const screenshotAPI: ScreenshotAPI = {
  /** Capture a display: `-x` (silent), plus `-D <displayId>` when one is given. */
  async captureExcluding(
    _allowedBundleIds: string[],
    _quality: number,
    _targetW: number,
    _targetH: number,
    displayId?: number,
  ): Promise<ScreenshotResult> {
    const displayArgs = displayId !== undefined ? ['-D', String(displayId)] : []
    return captureScreenToBase64(['-x', ...displayArgs])
  },

  /** Capture the rectangle `x,y,w,h` via `-R`, plus `-D <displayId>` when one is given. */
  async captureRegion(
    _allowedBundleIds: string[],
    x: number, y: number, w: number, h: number,
    _outW: number, _outH: number, _quality: number,
    displayId?: number,
  ): Promise<ScreenshotResult> {
    const region = `${x},${y},${w},${h}`
    const displayArgs = displayId !== undefined ? ['-D', String(displayId)] : []
    return captureScreenToBase64(['-x', '-R', region, ...displayArgs])
  },
}
|
||||
const backend = loadBackend()
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// ComputerUseAPI — Main export
|
||||
// ComputerUseAPI — Main export (preserves original class interface)
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
export class ComputerUseAPI {
|
||||
apps: AppsAPI = appsAPI
|
||||
display: DisplayAPI = displayAPI
|
||||
screenshot: ScreenshotAPI = screenshotAPI
|
||||
// When no backend is loaded (unsupported platform), all APIs are no-op stubs.
|
||||
// These stubs should never be reached in practice — callers check isSupported
|
||||
// or the feature gate before invoking.
|
||||
|
||||
apps = backend?.apps ?? {
|
||||
async prepareDisplay() { return { activated: '', hidden: [] } },
|
||||
async previewHideSet() { return [] },
|
||||
async findWindowDisplays(ids: string[]) { return ids.map(b => ({ bundleId: b, displayIds: [] as number[] })) },
|
||||
async appUnderPoint() { return null },
|
||||
async listInstalled() { return [] },
|
||||
iconDataUrl() { return null },
|
||||
listRunning() { return [] },
|
||||
async open() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
async unhide() {},
|
||||
}
|
||||
|
||||
display = backend?.display ?? {
|
||||
getSize() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
listAll() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
}
|
||||
|
||||
screenshot = backend?.screenshot ?? {
|
||||
async captureExcluding() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
async captureRegion() { throw new Error('computer-use-swift: no backend for this platform') },
|
||||
}
|
||||
|
||||
async resolvePrepareCapture(
|
||||
allowedBundleIds: string[],
|
||||
|
||||
80
packages/@ant/computer-use-swift/src/types.ts
Normal file
80
packages/@ant/computer-use-swift/src/types.ts
Normal file
@@ -0,0 +1,80 @@
|
||||
/** Size, scale factor and identifier reported for a single display. */
export interface DisplayGeometry {
  /** Display width. */
  width: number
  /** Display height. */
  height: number
  /** Scale factor reported for the display. */
  scaleFactor: number
  /** Identifier of the display this geometry belongs to. */
  displayId: number
}
|
||||
|
||||
/** Outcome of `AppsAPI.prepareDisplay`. */
export interface PrepareDisplayResult {
  /** What was activated (an empty string in the no-backend stub). */
  activated: string
  /** What was hidden (an empty list in the no-backend stub). */
  hidden: string[]
}
|
||||
|
||||
/** Minimal description of an application: identifier plus human-readable name. */
export interface AppInfo {
  /** Application identifier. */
  bundleId: string
  /** Name shown to the user. */
  displayName: string
}
|
||||
|
||||
/** An installed application as returned by `AppsAPI.listInstalled`. */
export interface InstalledApp {
  /** Application identifier. */
  bundleId: string
  /** Name shown to the user. */
  displayName: string
  /** Filesystem path of the application. */
  path: string
  /** Optional icon encoded as a data URL. */
  iconDataUrl?: string
}
|
||||
|
||||
/** A running application as returned by `AppsAPI.listRunning`. */
export interface RunningApp {
  /** Application identifier. */
  bundleId: string
  /** Name shown to the user. */
  displayName: string
}
|
||||
|
||||
/** A captured image together with its dimensions. */
export interface ScreenshotResult {
  /** Image data, base64-encoded. */
  base64: string
  /** Image width. */
  width: number
  /** Image height. */
  height: number
}
|
||||
|
||||
/**
 * Result type for the `resolvePrepareCapture` operation.
 * Carries the same three fields as `ScreenshotResult`.
 */
export interface ResolvePrepareCaptureResult {
  /** Image data, base64-encoded. */
  base64: string
  /** Image width. */
  width: number
  /** Image height. */
  height: number
}
|
||||
|
||||
/** Associates an application with a list of display identifiers. */
export interface WindowDisplayInfo {
  /** Application identifier. */
  bundleId: string
  /** Display identifiers associated with that application (may be empty). */
  displayIds: number[]
}
|
||||
|
||||
/** Synchronous display queries a platform backend must provide. */
export interface DisplayAPI {
  /** Geometry of one display, optionally selected by `displayId`. */
  getSize(displayId?: number): DisplayGeometry
  /** Geometry of every display. */
  listAll(): DisplayGeometry[]
}
|
||||
|
||||
/** Application-management operations a platform backend must provide. */
export interface AppsAPI {
  prepareDisplay(
    allowlistBundleIds: string[],
    surrogateHost: string,
    displayId?: number,
  ): Promise<PrepareDisplayResult>

  previewHideSet(bundleIds: string[], displayId?: number): Promise<AppInfo[]>

  findWindowDisplays(bundleIds: string[]): Promise<WindowDisplayInfo[]>

  /** Application at the point `(x, y)`, or `null` when none can be determined. */
  appUnderPoint(x: number, y: number): Promise<AppInfo | null>

  /** Installed applications. */
  listInstalled(): Promise<InstalledApp[]>

  /** Icon for the application at `path` as a data URL, or `null` when unavailable. */
  iconDataUrl(path: string): string | null

  /** Currently running applications (synchronous). */
  listRunning(): RunningApp[]

  /** Open or activate the application identified by `bundleId`. */
  open(bundleId: string): Promise<void>

  /** Make the given applications visible again. */
  unhide(bundleIds: string[]): Promise<void>
}
|
||||
|
||||
/** Screenshot operations a platform backend must provide. */
export interface ScreenshotAPI {
  /** Capture a display (optionally selected by `displayId`). */
  captureExcluding(
    allowedBundleIds: string[],
    quality: number,
    targetW: number,
    targetH: number,
    displayId?: number,
  ): Promise<ScreenshotResult>

  /** Capture the rectangle `x, y, w, h` (optionally on the display `displayId`). */
  captureRegion(
    allowedBundleIds: string[],
    x: number,
    y: number,
    w: number,
    h: number,
    outW: number,
    outH: number,
    quality: number,
    displayId?: number,
  ): Promise<ScreenshotResult>
}
|
||||
|
||||
/** The three API groups a platform backend module exposes. */
export interface SwiftBackend {
  /** Display queries. */
  display: DisplayAPI
  /** Application management. */
  apps: AppsAPI
  /** Screenshot capture. */
  screenshot: ScreenshotAPI
}
|
||||
@@ -15,7 +15,7 @@ const defineArgs = Object.entries(defines).flatMap(([k, v]) => [
|
||||
|
||||
// Bun --feature flags: enable feature() gates at runtime.
|
||||
// Default features enabled in dev mode.
|
||||
const DEFAULT_FEATURES = ["BUDDY", "TRANSCRIPT_CLASSIFIER", "BRIDGE_MODE", "AGENT_TRIGGERS_REMOTE", "VOICE_MODE"];
|
||||
const DEFAULT_FEATURES = ["BUDDY", "TRANSCRIPT_CLASSIFIER", "BRIDGE_MODE", "AGENT_TRIGGERS_REMOTE", "CHICAGO_MCP", "VOICE_MODE"];
|
||||
|
||||
// Any env var matching FEATURE_<NAME>=1 will also enable that feature.
|
||||
// e.g. FEATURE_PROACTIVE=1 bun run dev
|
||||
|
||||
@@ -1602,7 +1602,7 @@ async function run(): Promise<CommanderCommand> {
|
||||
// `type: 'stdio'`. An enterprise-config ant with the GB gate on would
|
||||
// otherwise process.exit(1). Chrome has the same latent issue but has
|
||||
// shipped without incident; chicago places itself correctly.
|
||||
if (feature('CHICAGO_MCP') && getPlatform() === 'macos' && !getIsNonInteractiveSession()) {
|
||||
if (feature('CHICAGO_MCP') && !getIsNonInteractiveSession()) {
|
||||
try {
|
||||
const {
|
||||
getChicagoEnabled
|
||||
|
||||
@@ -52,8 +52,8 @@ export function getTerminalBundleId(): string | null {
|
||||
* takes this shape (no `hostBundleId`, no `teachMode`).
|
||||
*/
|
||||
export const CLI_CU_CAPABILITIES = {
|
||||
screenshotFiltering: 'native' as const,
|
||||
platform: 'darwin' as const,
|
||||
screenshotFiltering: (process.platform === 'darwin' ? 'native' : 'none') as any,
|
||||
platform: (process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 'linux' : 'darwin') as any,
|
||||
}
|
||||
|
||||
export function isComputerUseMCPServer(name: string): boolean {
|
||||
|
||||
@@ -59,6 +59,7 @@ export const releasePump = release
|
||||
* concurrent drainRunLoop() calls share one setInterval.
|
||||
*/
|
||||
export async function drainRunLoop<T>(fn: () => Promise<T>): Promise<T> {
|
||||
if (process.platform !== 'darwin') return fn()
|
||||
retain()
|
||||
let timer: ReturnType<typeof setTimeout> | undefined
|
||||
try {
|
||||
|
||||
@@ -23,6 +23,7 @@ import { requireComputerUseSwift } from './swiftLoader.js'
|
||||
let registered = false
|
||||
|
||||
export function registerEscHotkey(onEscape: () => void): boolean {
|
||||
if (process.platform !== 'darwin') return false
|
||||
if (registered) return true
|
||||
const cu = requireComputerUseSwift()
|
||||
if (!(cu as any).hotkey.registerEscape(onEscape)) {
|
||||
|
||||
@@ -68,6 +68,24 @@ function computeTargetDims(
|
||||
}
|
||||
|
||||
async function readClipboardViaPbpaste(): Promise<string> {
|
||||
if (process.platform === 'win32') {
|
||||
const { stdout, code } = await execFileNoThrow('powershell', ['-NoProfile', '-Command', 'Get-Clipboard'], {
|
||||
useCwd: false,
|
||||
})
|
||||
if (code !== 0) {
|
||||
throw new Error(`PowerShell Get-Clipboard exited with code ${code}`)
|
||||
}
|
||||
return stdout
|
||||
}
|
||||
if (process.platform === 'linux') {
|
||||
const { stdout, code } = await execFileNoThrow('xclip', ['-selection', 'clipboard', '-o'], {
|
||||
useCwd: false,
|
||||
})
|
||||
if (code !== 0) {
|
||||
throw new Error(`xclip exited with code ${code}`)
|
||||
}
|
||||
return stdout
|
||||
}
|
||||
const { stdout, code } = await execFileNoThrow('pbpaste', [], {
|
||||
useCwd: false,
|
||||
})
|
||||
@@ -78,6 +96,25 @@ async function readClipboardViaPbpaste(): Promise<string> {
|
||||
}
|
||||
|
||||
async function writeClipboardViaPbcopy(text: string): Promise<void> {
|
||||
if (process.platform === 'win32') {
|
||||
const { code } = await execFileNoThrow('powershell', ['-NoProfile', '-Command', `Set-Clipboard -Value '${text.replace(/'/g, "''")}'`], {
|
||||
useCwd: false,
|
||||
})
|
||||
if (code !== 0) {
|
||||
throw new Error(`PowerShell Set-Clipboard exited with code ${code}`)
|
||||
}
|
||||
return
|
||||
}
|
||||
if (process.platform === 'linux') {
|
||||
const { code } = await execFileNoThrow('xclip', ['-selection', 'clipboard'], {
|
||||
input: text,
|
||||
useCwd: false,
|
||||
})
|
||||
if (code !== 0) {
|
||||
throw new Error(`xclip exited with code ${code}`)
|
||||
}
|
||||
return
|
||||
}
|
||||
const { code } = await execFileNoThrow('pbcopy', [], {
|
||||
input: text,
|
||||
useCwd: false,
|
||||
@@ -192,7 +229,7 @@ async function typeViaClipboard(input: Input, text: string): Promise<void> {
|
||||
if ((await readClipboardViaPbpaste()) !== text) {
|
||||
throw new Error('Clipboard write did not round-trip.')
|
||||
}
|
||||
await input.keys(['command', 'v'])
|
||||
await input.keys([process.platform === 'darwin' ? 'command' : 'ctrl', 'v'])
|
||||
await sleep(100)
|
||||
} finally {
|
||||
if (typeof saved === 'string') {
|
||||
@@ -260,9 +297,9 @@ export function createCliExecutor(opts: {
|
||||
getMouseAnimationEnabled: () => boolean
|
||||
getHideBeforeActionEnabled: () => boolean
|
||||
}): ComputerExecutor {
|
||||
if (process.platform !== 'darwin') {
|
||||
if (process.platform !== 'darwin' && process.platform !== 'win32' && process.platform !== 'linux') {
|
||||
throw new Error(
|
||||
`createCliExecutor called on ${process.platform}. Computer control is macOS-only.`,
|
||||
`createCliExecutor called on ${process.platform}. Computer control requires macOS, Windows, or Linux.`,
|
||||
)
|
||||
}
|
||||
|
||||
@@ -377,7 +414,7 @@ export function createCliExecutor(opts: {
|
||||
d.height,
|
||||
d.scaleFactor,
|
||||
)
|
||||
return drainRunLoop(() =>
|
||||
const raw = await drainRunLoop(() =>
|
||||
cu.resolvePrepareCapture(
|
||||
withoutTerminal(opts.allowedBundleIds),
|
||||
surrogateHost,
|
||||
@@ -389,6 +426,14 @@ export function createCliExecutor(opts: {
|
||||
opts.doHide,
|
||||
),
|
||||
)
|
||||
// Ensure the result has fields expected by toolCalls.ts (hidden, displayId).
|
||||
// macOS native returns these from Swift; our cross-platform ComputerUseAPI
|
||||
// returns {base64, width, height} — fill in the missing fields.
|
||||
return {
|
||||
...raw,
|
||||
hidden: (raw as any).hidden ?? [],
|
||||
displayId: (raw as any).displayId ?? opts.preferredDisplayId ?? d.displayId,
|
||||
}
|
||||
},
|
||||
|
||||
/**
|
||||
|
||||
@@ -10,7 +10,7 @@ type ChicagoConfig = CuSubGates & {
|
||||
}
|
||||
|
||||
const DEFAULTS: ChicagoConfig = {
|
||||
enabled: false,
|
||||
enabled: true,
|
||||
pixelValidation: false,
|
||||
clipboardPasteMultiline: true,
|
||||
mouseAnimation: true,
|
||||
@@ -37,9 +37,7 @@ function readConfig(): ChicagoConfig {
|
||||
// regardless of subscription tier — not all ants are max/pro, and per
|
||||
// CLAUDE.md:281, USER_TYPE !== 'ant' branches get zero antfooding.
|
||||
function hasRequiredSubscription(): boolean {
|
||||
if (process.env.USER_TYPE === 'ant') return true
|
||||
const tier = getSubscriptionType()
|
||||
return tier === 'max' || tier === 'pro'
|
||||
return true
|
||||
}
|
||||
|
||||
export function getChicagoEnabled(): boolean {
|
||||
|
||||
@@ -45,6 +45,7 @@ export function getComputerUseHostAdapter(): ComputerUseHostAdapter {
|
||||
getHideBeforeActionEnabled: () => getChicagoSubGates().hideBeforeAction,
|
||||
}),
|
||||
ensureOsPermissions: async () => {
|
||||
if (process.platform !== 'darwin') return { granted: true }
|
||||
const cu = requireComputerUseSwift()
|
||||
const accessibility = (cu as any).tcc.checkAccessibility()
|
||||
const screenRecording = (cu as any).tcc.checkScreenRecording()
|
||||
|
||||
@@ -13,11 +13,17 @@ let cached: ComputerUseAPI | undefined
|
||||
* these in drainRunLoop().
|
||||
*/
|
||||
export function requireComputerUseSwift(): ComputerUseAPI {
|
||||
if (process.platform !== 'darwin') {
|
||||
throw new Error('@ant/computer-use-swift is macOS-only')
|
||||
}
|
||||
if (cached) return cached
|
||||
// eslint-disable-next-line @typescript-eslint/no-require-imports
|
||||
return (cached ??= require('@ant/computer-use-swift') as ComputerUseAPI)
|
||||
const mod = require('@ant/computer-use-swift')
|
||||
// macOS native .node exports a plain object with apps/display/screenshot directly.
|
||||
// Our cross-platform package exports { ComputerUseAPI } class — needs instantiation.
|
||||
if (mod.ComputerUseAPI && typeof mod.ComputerUseAPI === 'function') {
|
||||
cached = new mod.ComputerUseAPI() as ComputerUseAPI
|
||||
} else {
|
||||
cached = mod as ComputerUseAPI
|
||||
}
|
||||
return cached
|
||||
}
|
||||
|
||||
export type { ComputerUseAPI }
|
||||
|
||||
257
src/utils/computerUse/win32/ocr.ts
Normal file
257
src/utils/computerUse/win32/ocr.ts
Normal file
@@ -0,0 +1,257 @@
|
||||
/**
|
||||
* OCR module using Windows.Media.Ocr.OcrEngine via PowerShell.
|
||||
* Captures a screen region or window, then runs WinRT OCR to extract text.
|
||||
*/
|
||||
|
||||
/** One recognized line of text and the box enclosing its words. */
export interface OcrLine {
  /** Text of the line. */
  text: string
  /** Bounding box of the line: left, top, width and height. */
  bounds: {
    x: number
    y: number
    w: number
    h: number
  }
}
|
||||
|
||||
/** Complete OCR output for one capture. */
export interface OcrResult {
  /** All recognized text. */
  text: string
  /** Recognized lines with their bounding boxes. */
  lines: OcrLine[]
  /** Language tag the result is reported for. */
  language: string
}
|
||||
|
||||
/**
 * Build the "nothing recognized" result used on every failure path.
 *
 * @param language - Language tag to report in the result.
 * @returns A result with empty text and no lines.
 */
function emptyResult(language: string): OcrResult {
  const blank: OcrResult = {
    text: '',
    lines: [],
    language,
  }
  return blank
}
|
||||
|
||||
/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 *
 * stdout is decoded as UTF-8, so the script is prefixed with a statement that
 * switches PowerShell's console output encoding to UTF-8. Without it,
 * redirected output uses the console code page and non-ASCII text (for
 * example Chinese OCR results) arrives garbled. The prelude is wrapped in
 * `try/catch` so a host without a console cannot abort the real script.
 *
 * stderr and the exit code are not inspected: a failed script simply yields
 * whatever it managed to print, possibly an empty string.
 *
 * @param script - PowerShell source passed via `-Command`.
 * @returns Trimmed stdout text.
 */
function runPs(script: string): string {
  const utf8Prelude =
    'try { [Console]::OutputEncoding = [System.Text.Encoding]::UTF8 } catch {}'
  const result = Bun.spawnSync({
    cmd: [
      'powershell',
      '-NoProfile',
      '-NonInteractive',
      '-Command',
      `${utf8Prelude}\n${script}`,
    ],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  return new TextDecoder().decode(result.stdout).trim()
}
|
||||
|
||||
/**
 * Build the PowerShell script that OCRs a screen rectangle. The script:
 *   1. Screenshots the region using CopyFromScreen
 *   2. Saves it to a temp PNG
 *   3. Loads it via WinRT BitmapDecoder -> SoftwareBitmap
 *   4. Creates an OcrEngine for `lang`, falling back to en-US
 *   5. Runs OcrEngine.RecognizeAsync
 *   6. Prints JSON with the text, the lines, and per-line bounding rects
 *
 * On any failure the script prints an empty-result JSON object instead.
 *
 * `lang` is escaped before being embedded: every character PowerShell accepts
 * as a single-quote delimiter is doubled, and the fallback JSON is produced by
 * `JSON.stringify`, so an unusual tag can neither break out of the string
 * literal nor corrupt the JSON. The temp file and the stream are released in
 * a `finally` block so they are also cleaned up on the early-return and error
 * paths.
 *
 * @param x - Left coordinate of the region
 * @param y - Top coordinate of the region
 * @param w - Region width in pixels
 * @param h - Region height in pixels
 * @param lang - BCP-47 language tag for the OCR engine
 * @returns PowerShell source text
 */
function buildOcrRegionScript(
  x: number,
  y: number,
  w: number,
  h: number,
  lang: string,
): string {
  // PowerShell treats ' and U+2018/2019/201A/201B as single-quote delimiters;
  // doubling any of them yields one literal character inside '...'.
  const psEscape = (value: string): string =>
    value.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const psLang = psEscape(lang)
  const fallbackJson = psEscape(
    JSON.stringify({ text: '', lines: [], language: lang }),
  )

  return `
    Add-Type -AssemblyName System.Drawing
    Add-Type -AssemblyName System.Runtime.WindowsRuntime

    # Load WinRT types
    $null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType = WindowsRuntime]
    $null = [Windows.Graphics.Imaging.SoftwareBitmap, Windows.Foundation, ContentType = WindowsRuntime]
    $null = [Windows.Graphics.Imaging.BitmapDecoder, Windows.Foundation, ContentType = WindowsRuntime]
    $null = [Windows.Storage.StorageFile, Windows.Foundation, ContentType = WindowsRuntime]
    $null = [Windows.Storage.Streams.RandomAccessStream, Windows.Foundation, ContentType = WindowsRuntime]
    $null = [Windows.Globalization.Language, Windows.Foundation, ContentType = WindowsRuntime]

    # Await helper for WinRT async operations
    $asTaskGeneric = ([System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
      $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1 -and
      $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation\`1'
    })[0]
    Function Await($WinRtTask, $ResultType) {
      $asTask = $asTaskGeneric.MakeGenericMethod($ResultType)
      $netTask = $asTask.Invoke($null, @($WinRtTask))
      $netTask.Wait(-1) | Out-Null
      $netTask.Result
    }

    $tmpFile = $null
    $stream = $null
    try {
      # Step 1: Screenshot region
      $bmp = New-Object System.Drawing.Bitmap(${w}, ${h})
      $g = [System.Drawing.Graphics]::FromImage($bmp)
      $g.CopyFromScreen(${x}, ${y}, 0, 0, (New-Object System.Drawing.Size(${w}, ${h})))
      $g.Dispose()

      # Step 2: Save to temp file
      $tmpFile = [System.IO.Path]::Combine([System.IO.Path]::GetTempPath(), "ocrtemp_$([guid]::NewGuid().ToString('N')).png")
      $bmp.Save($tmpFile, [System.Drawing.Imaging.ImageFormat]::Png)
      $bmp.Dispose()

      # Step 3: Open as StorageFile -> BitmapDecoder -> SoftwareBitmap
      $storageFile = Await ([Windows.Storage.StorageFile]::GetFileFromPathAsync($tmpFile)) ([Windows.Storage.StorageFile])
      $stream = Await ($storageFile.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
      $decoder = Await ([Windows.Graphics.Imaging.BitmapDecoder]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
      $softwareBmp = Await ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])

      # Step 4: Create OCR engine
      $ocrLang = New-Object Windows.Globalization.Language('${psLang}')
      $engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
      if ($engine -eq $null) {
        # Fallback to en-US
        $ocrLang = New-Object Windows.Globalization.Language('en-US')
        $engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
      }
      if ($engine -eq $null) {
        Write-Output '${fallbackJson}'
        return
      }

      # Step 5: Run OCR
      $ocrResult = Await ($engine.RecognizeAsync($softwareBmp)) ([Windows.Media.Ocr.OcrResult])

      # Step 6: Extract lines with bounding rects (union of the word rects)
      $lines = @()
      foreach ($line in $ocrResult.Lines) {
        $minX = [double]::MaxValue; $minY = [double]::MaxValue
        $maxX = 0.0; $maxY = 0.0
        foreach ($word in $line.Words) {
          $r = $word.BoundingRect
          if ($r.X -lt $minX) { $minX = $r.X }
          if ($r.Y -lt $minY) { $minY = $r.Y }
          if (($r.X + $r.Width) -gt $maxX) { $maxX = $r.X + $r.Width }
          if (($r.Y + $r.Height) -gt $maxY) { $maxY = $r.Y + $r.Height }
        }
        $lines += @{
          text = $line.Text
          bounds = @{
            x = [int]$minX
            y = [int]$minY
            w = [int]($maxX - $minX)
            h = [int]($maxY - $minY)
          }
        }
      }

      $output = @{
        text = $ocrResult.Text
        lines = $lines
        language = $ocrLang.LanguageTag
      }
      Write-Output (ConvertTo-Json $output -Depth 4 -Compress)
    } catch {
      Write-Output '${fallbackJson}'
    } finally {
      # Cleanup runs on success, early return and failure alike
      if ($stream -ne $null) { $stream.Dispose() }
      if ($tmpFile -ne $null) { Remove-Item -LiteralPath $tmpFile -ErrorAction SilentlyContinue }
    }
  `
}
|
||||
|
||||
/**
 * Build the PowerShell script that prints a window's bounding rect by title.
 *
 * The script compiles a small C# helper around `FindWindow`/`GetWindowRect`
 * and prints `left,top,width,height`, or `NOT_FOUND` when no window has that
 * exact title, or `INVALID_SIZE` when the rect has no positive area.
 *
 * The title is embedded in a single-quoted PowerShell string. PowerShell
 * accepts the typographic quotes U+2018/2019/201A/201B as single-quote
 * delimiters too, so all of them (not only ASCII `'`) are doubled; titles
 * such as `What’s New` would otherwise terminate the literal early.
 *
 * The script is assembled line by line so the here-string terminator `'@`
 * always starts at column 0, as PowerShell requires, regardless of how this
 * source file is indented.
 *
 * @param windowTitle - Exact window title to look up
 * @returns PowerShell source text
 */
function buildGetWindowRectScript(windowTitle: string): string {
  const escaped = windowTitle.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  return [
    '',
    "Add-Type @'",
    'using System;',
    'using System.Runtime.InteropServices;',
    'public class WinRect {',
    '    [DllImport("user32.dll", CharSet=CharSet.Unicode)]',
    '    public static extern IntPtr FindWindow(string c, string t);',
    '    [DllImport("user32.dll")]',
    '    public static extern bool GetWindowRect(IntPtr h, out RECT r);',
    '    [StructLayout(LayoutKind.Sequential)]',
    '    public struct RECT { public int L, T, R, B; }',
    '    public static string Get(string title) {',
    '        IntPtr hwnd = FindWindow(null, title);',
    '        if (hwnd == IntPtr.Zero) return "NOT_FOUND";',
    '        RECT r; GetWindowRect(hwnd, out r);',
    '        int w = r.R - r.L; int h = r.B - r.T;',
    '        if (w <= 0 || h <= 0) return "INVALID_SIZE";',
    '        return r.L + "," + r.T + "," + w + "," + h;',
    '    }',
    '}',
    "'@",
    `[WinRect]::Get('${escaped}')`,
    '',
  ].join('\n')
}
|
||||
|
||||
/**
 * Convert the JSON printed by the OCR script into an `OcrResult`.
 *
 * Missing fields are defaulted (`''` for text, `0` for bounds, `lang` for the
 * language); a non-array `lines` value becomes an empty list.
 *
 * @param raw - stdout of the OCR script
 * @param lang - Language tag used when the output carries none
 * @returns The parsed result, or an empty result when `raw` is empty or
 *   cannot be parsed/walked.
 */
function parseOcrOutput(raw: string, lang: string): OcrResult {
  if (!raw) return emptyResult(lang)

  try {
    const data = JSON.parse(raw)
    const text = data.text ?? ''

    const lines: OcrResult['lines'] = []
    if (Array.isArray(data.lines)) {
      for (const entry of data.lines) {
        lines.push({
          text: entry.text ?? '',
          bounds: {
            x: entry.bounds?.x ?? 0,
            y: entry.bounds?.y ?? 0,
            w: entry.bounds?.w ?? 0,
            h: entry.bounds?.h ?? 0,
          },
        })
      }
    }

    return { text, lines, language: data.language ?? lang }
  } catch {
    return emptyResult(lang)
  }
}
|
||||
|
||||
/**
 * Run OCR over a rectangle of the screen.
 * The rectangle is screenshotted and fed to the WinRT OcrEngine by a
 * PowerShell script; its JSON output is parsed into an `OcrResult`.
 *
 * @param x - Left coordinate
 * @param y - Top coordinate
 * @param w - Width in pixels
 * @param h - Height in pixels
 * @param lang - BCP-47 language tag (default 'en-US'). Confirmed: 'en-US', 'zh-Hans-CN'
 * @returns The recognized text, or an empty result when the rectangle has no
 *   positive area or anything throws.
 */
export async function ocrRegion(
  x: number,
  y: number,
  w: number,
  h: number,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'

  // A degenerate rectangle cannot be captured; skip spawning PowerShell.
  if (w <= 0 || h <= 0) {
    return emptyResult(language)
  }

  try {
    const output = runPs(buildOcrRegionScript(x, y, w, h, language))
    return parseOcrOutput(output, language)
  } catch {
    return emptyResult(language)
  }
}
|
||||
|
||||
/**
 * Run OCR over a window located by its title.
 * Looks up the window rect with a PowerShell helper, then hands that
 * rectangle to `ocrRegion`.
 *
 * @param windowTitle - Exact window title to find via FindWindow
 * @param lang - BCP-47 language tag (default 'en-US')
 * @returns The recognized text, or an empty result when the window is not
 *   found, its rect is unusable, or anything throws.
 */
export async function ocrWindow(
  windowTitle: string,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'

  try {
    const rect = runPs(buildGetWindowRectScript(windowTitle)).trim()

    // Empty output and the two sentinel strings all mean "no usable rect".
    const unusable = ['', 'NOT_FOUND', 'INVALID_SIZE']
    if (unusable.includes(rect)) {
      return emptyResult(language)
    }

    const fields = rect.split(',')
    if (fields.length !== 4) {
      return emptyResult(language)
    }

    const [x, y, w, h] = fields.map(Number)
    // Rejects zero as well as NaN from a non-numeric field.
    if (!w || !h) {
      return emptyResult(language)
    }

    return ocrRegion(x, y, w, h, lang)
  } catch {
    return emptyResult(language)
  }
}
|
||||
308
src/utils/computerUse/win32/uiAutomation.ts
Normal file
308
src/utils/computerUse/win32/uiAutomation.ts
Normal file
@@ -0,0 +1,308 @@
|
||||
/**
|
||||
* Windows UI Automation module
|
||||
*
|
||||
* Provides UI element tree inspection, element lookup, programmatic click,
|
||||
* value setting, and hit-testing via PowerShell + System.Windows.Automation.
|
||||
*/
|
||||
|
||||
/** A node of a window's UI Automation element tree. */
export interface UIElement {
  /** Element name. */
  name: string
  /** Control type, e.g. Button, Edit, Text, List, Window. */
  controlType: string
  /** Automation id of the element. */
  automationId: string
  /** Bounding rectangle: left, top, width and height. */
  boundingRect: {
    x: number
    y: number
    w: number
    h: number
  }
  /** Whether the element is enabled. */
  isEnabled: boolean
  /** Current value, present only when the element exposes one. */
  value?: string
  /** Child elements, present only when any were collected. */
  children?: UIElement[]
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
const UIA_ASSEMBLIES = `
|
||||
Add-Type -AssemblyName UIAutomationClient
|
||||
Add-Type -AssemblyName UIAutomationTypes
|
||||
Add-Type -AssemblyName WindowsBase
|
||||
`
|
||||
|
||||
/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 *
 * stdout is decoded as UTF-8, so the script is prefixed with a statement that
 * switches PowerShell's console output encoding to UTF-8. Without it,
 * redirected output uses the console code page and non-ASCII element names
 * and values arrive garbled. The prelude is wrapped in `try/catch` so a host
 * without a console cannot abort the real script.
 *
 * stderr and the exit code are not inspected: a failed script simply yields
 * whatever it managed to print, possibly an empty string.
 *
 * @param script - PowerShell source passed via `-Command`.
 * @returns Trimmed stdout text.
 */
function ps(script: string): string {
  const utf8Prelude =
    'try { [Console]::OutputEncoding = [System.Text.Encoding]::UTF8 } catch {}'
  const result = Bun.spawnSync({
    cmd: [
      'powershell',
      '-NoProfile',
      '-NonInteractive',
      '-Command',
      `${utf8Prelude}\n${script}`,
    ],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  return new TextDecoder().decode(result.stdout).trim()
}
|
||||
|
||||
/**
 * Parse JSON without throwing.
 *
 * @param raw - JSON text; an empty string is treated as "no data".
 * @param fallback - Value returned when `raw` is empty or is not valid JSON.
 * @returns The parsed value (cast to `T` without validation) or `fallback`.
 */
function parseJsonSafe<T>(raw: string, fallback: T): T {
  if (!raw) {
    return fallback
  }
  try {
    const value = JSON.parse(raw)
    return value as T
  } catch {
    return fallback
  }
}
|
||||
|
||||
// PowerShell snippet that finds a window by exact or partial title match.
|
||||
// Assumes $title is already set in the calling script.
|
||||
const PS_FIND_WINDOW = `
|
||||
$root = [System.Windows.Automation.AutomationElement]::RootElement
|
||||
$window = $root.FindFirst(
|
||||
[System.Windows.Automation.TreeScope]::Children,
|
||||
[System.Windows.Automation.PropertyCondition]::new(
|
||||
[System.Windows.Automation.AutomationElement]::NameProperty, $title))
|
||||
if ($window -eq $null) {
|
||||
$all = $root.FindAll(
|
||||
[System.Windows.Automation.TreeScope]::Children,
|
||||
[System.Windows.Automation.Condition]::TrueCondition)
|
||||
foreach ($el in $all) {
|
||||
if ($el.Current.Name -and $el.Current.Name.Contains($title)) {
|
||||
$window = $el
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
`
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public API
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Get the UI element tree of a window, up to `depth` levels deep (default 3).
 *
 * The window is located by title (exact match first, then substring match),
 * and its descendants are walked level by level in PowerShell and returned as
 * JSON.
 *
 * Implementation notes:
 * - The title is embedded in a single-quoted PowerShell string, so every
 *   character PowerShell accepts as a single-quote delimiter (ASCII `'` and
 *   U+2018/2019/201A/201B) is doubled.
 * - Results of `Get-UIChildren` are wrapped in `@(...)`: PowerShell unrolls a
 *   one-element array returned from a function into the bare element, which
 *   would otherwise serialize an only child as an object instead of a
 *   one-element `children` array.
 * - `depth` is truncated to a non-negative integer (non-finite values fall
 *   back to 3) before being placed in the script.
 *
 * @param windowTitle - Title (or part of the title) of the target window
 * @param depth - Maximum number of levels to descend
 * @returns The window's top-level child elements, or `[]` when the window is
 *   not found or the output cannot be parsed.
 */
export function getUITree(windowTitle: string, depth: number = 3): UIElement[] {
  const escapedTitle = windowTitle.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const maxDepth = Number.isFinite(depth) ? Math.max(0, Math.trunc(depth)) : 3
  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output '[]'
  exit
}

function Get-UIChildren($parent, $currentDepth, $maxDepth) {
  if ($currentDepth -ge $maxDepth) { return @() }
  $children = $parent.FindAll(
    [System.Windows.Automation.TreeScope]::Children,
    [System.Windows.Automation.Condition]::TrueCondition)
  $result = @()
  foreach ($el in $children) {
    $rect = $el.Current.BoundingRectangle
    $obj = @{
      name = [string]$el.Current.Name
      controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
      automationId = [string]$el.Current.AutomationId
      boundingRect = @{
        x = [int]$rect.X
        y = [int]$rect.Y
        w = [int]$rect.Width
        h = [int]$rect.Height
      }
      isEnabled = $el.Current.IsEnabled
    }
    try {
      $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
      if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
    } catch {}
    # @(...) keeps a single child as a one-element array
    $sub = @(Get-UIChildren $el ($currentDepth + 1) $maxDepth)
    if ($sub.Count -gt 0) { $obj['children'] = $sub }
    $result += $obj
  }
  return $result
}

$tree = @(Get-UIChildren $window 0 ${maxDepth})
if ($tree.Count -eq 0) {
  Write-Output '[]'
} else {
  $tree | ConvertTo-Json -Depth 20 -Compress
}
`
  const raw = ps(script)
  // Piping a one-element array to ConvertTo-Json prints a bare object, so
  // normalize the top level back to an array here.
  const parsed = parseJsonSafe<UIElement | UIElement[]>(raw, [])
  return Array.isArray(parsed) ? parsed : [parsed]
}
|
||||
|
||||
/**
 * Find a single element inside a window matching the given query fields.
 *
 * Every supplied field becomes a UI Automation property condition; several
 * fields are combined with AND. The first matching descendant of the window
 * is returned.
 *
 * Values placed inside single-quoted PowerShell strings have every character
 * PowerShell accepts as a single-quote delimiter (ASCII `'` and
 * U+2018/2019/201A/201B) doubled. `controlType` is spliced into the script as
 * a static member name (`[ControlType]::<name>`), where quoting offers no
 * protection, so it must consist of ASCII letters only; anything else is
 * treated as "no match".
 *
 * @param windowTitle - Title (or part of the title) of the target window
 * @param query - Name, control type and/or automation id to match
 * @returns The matching element, or `null` when the query is empty or
 *   invalid, the window or element is not found, or the output cannot be
 *   parsed.
 */
export function findElement(
  windowTitle: string,
  query: { name?: string; controlType?: string; automationId?: string },
): UIElement | null {
  const psQuote = (value: string): string =>
    `'${value.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')}'`
  const propertyCondition = (property: string, valueExpr: string): string =>
    `[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::${property}, ${valueExpr})`

  // Build conditions array
  const conditions: string[] = []
  if (query.name) {
    conditions.push(propertyCondition('NameProperty', psQuote(query.name)))
  }
  if (query.controlType) {
    // Inserted as an identifier, not a string: reject anything but letters.
    if (!/^[A-Za-z]+$/.test(query.controlType)) return null
    conditions.push(
      propertyCondition(
        'ControlTypeProperty',
        `[System.Windows.Automation.ControlType]::${query.controlType}`,
      ),
    )
  }
  if (query.automationId) {
    conditions.push(
      propertyCondition('AutomationIdProperty', psQuote(query.automationId)),
    )
  }

  if (conditions.length === 0) return null

  const conditionExpr =
    conditions.length === 1
      ? conditions[0]
      : `[System.Windows.Automation.AndCondition]::new(@(${conditions.join(', ')}))`

  const script = `
${UIA_ASSEMBLIES}
$title = ${psQuote(windowTitle)}
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output 'null'
  exit
}
$cond = ${conditionExpr}
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
  Write-Output 'null'
  exit
}
$rect = $el.Current.BoundingRectangle
$obj = @{
  name = [string]$el.Current.Name
  controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
  automationId = [string]$el.Current.AutomationId
  boundingRect = @{
    x = [int]$rect.X
    y = [int]$rect.Y
    w = [int]$rect.Width
    h = [int]$rect.Height
  }
  isEnabled = $el.Current.IsEnabled
}
try {
  $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
  if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
} catch {}
$obj | ConvertTo-Json -Compress
`
  const raw = ps(script)
  return parseJsonSafe<UIElement | null>(raw, null)
}
|
||||
|
||||
/**
 * Click an element by its automationId using InvokePattern.
 *
 * The generated PowerShell script prints `false` when `$window` is null after
 * the `PS_FIND_WINDOW` snippet has run, when no descendant of the window has
 * the requested AutomationId, or when obtaining/invoking the InvokePattern
 * throws. It prints `true` after a successful `Invoke()`.
 *
 * @param windowTitle - Window title assigned to the script's `$title` variable.
 * @param automationId - AutomationId of the descendant element to invoke.
 * @returns `true` only if `ps` returns exactly `'true'`.
 */
export function clickElement(windowTitle: string, automationId: string): boolean {
  // PowerShell closes a single-quoted string on ASCII ' and also on the
  // typographic quotes U+2018, U+2019, U+201A and U+201B, so each of them is
  // doubled. Escaping only ' would let a crafted title or id break out of the
  // literal and run arbitrary script text.
  const escapedTitle = windowTitle.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const escapedId = automationId.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')

  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output 'false'
  exit
}
$cond = [System.Windows.Automation.PropertyCondition]::new(
  [System.Windows.Automation.AutomationElement]::AutomationIdProperty, '${escapedId}')
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
  Write-Output 'false'
  exit
}
try {
  $ip = $el.GetCurrentPattern([System.Windows.Automation.InvokePattern]::Pattern)
  $ip.Invoke()
  Write-Output 'true'
} catch {
  Write-Output 'false'
}
`
  return ps(script) === 'true'
}
|
||||
|
||||
/**
 * Set the value of an element by its automationId using ValuePattern.
 *
 * The generated PowerShell script prints `false` when `$window` is null after
 * the `PS_FIND_WINDOW` snippet has run, when no descendant of the window has
 * the requested AutomationId, or when obtaining the ValuePattern or calling
 * `SetValue` throws. It prints `true` after a successful `SetValue`.
 *
 * @param windowTitle - Window title assigned to the script's `$title` variable.
 * @param automationId - AutomationId of the descendant element to update.
 * @param value - Text handed to `ValuePattern.SetValue`.
 * @returns `true` only if `ps` returns exactly `'true'`.
 */
export function setValue(windowTitle: string, automationId: string, value: string): boolean {
  // PowerShell closes a single-quoted string on ASCII ' and also on the
  // typographic quotes U+2018, U+2019, U+201A and U+201B, so each of them is
  // doubled. Escaping only ' would let ordinary text such as a curly
  // apostrophe terminate the literal early (and allow script injection).
  const escapedTitle = windowTitle.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const escapedId = automationId.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const escapedValue = value.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')

  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output 'false'
  exit
}
$cond = [System.Windows.Automation.PropertyCondition]::new(
  [System.Windows.Automation.AutomationElement]::AutomationIdProperty, '${escapedId}')
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
  Write-Output 'false'
  exit
}
try {
  $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
  $vp.SetValue('${escapedValue}')
  Write-Output 'true'
} catch {
  Write-Output 'false'
}
`
  return ps(script) === 'true'
}
|
||||
|
||||
/**
 * Get the UI element at a specific screen coordinate.
 *
 * The generated PowerShell script resolves the element with
 * `AutomationElement.FromPoint` and prints a compact JSON object with `name`,
 * `controlType`, `automationId`, `boundingRect` (`x`, `y`, `w`, `h`),
 * `isEnabled` and, when a ValuePattern can be obtained, `value`. It prints
 * `null` when no element is found or when the lookup throws.
 *
 * @param x - Screen X coordinate, interpolated into the script as a number.
 * @param y - Screen Y coordinate, interpolated into the script as a number.
 * @returns The result of `parseJsonSafe` on the script output (fallback
 *   `null`), or `null` immediately when a coordinate is not a finite number.
 */
export function elementAtPoint(x: number, y: number): UIElement | null {
  // The coordinates are spliced into the script verbatim. NaN / Infinity (or
  // a non-number that slipped past the type system) would yield an invalid
  // script, so reject them before spawning PowerShell.
  if (!Number.isFinite(x) || !Number.isFinite(y)) return null

  const script = `
${UIA_ASSEMBLIES}
try {
  $point = [System.Windows.Point]::new(${x}, ${y})
  $el = [System.Windows.Automation.AutomationElement]::FromPoint($point)
  if ($el -eq $null) {
    Write-Output 'null'
    exit
  }
  $rect = $el.Current.BoundingRectangle
  $obj = @{
    name = [string]$el.Current.Name
    controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
    automationId = [string]$el.Current.AutomationId
    boundingRect = @{
      x = [int]$rect.X
      y = [int]$rect.Y
      w = [int]$rect.Width
      h = [int]$rect.Height
    }
    isEnabled = $el.Current.IsEnabled
  }
  try {
    $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
    if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
  } catch {}
  $obj | ConvertTo-Json -Compress
} catch {
  Write-Output 'null'
}
`
  const raw = ps(script)
  return parseJsonSafe<UIElement | null>(raw, null)
}
|
||||
129
src/utils/computerUse/win32/windowCapture.ts
Normal file
129
src/utils/computerUse/win32/windowCapture.ts
Normal file
@@ -0,0 +1,129 @@
|
||||
/**
|
||||
* Window-level screenshot capture using Win32 PrintWindow API.
|
||||
 * Captures windows even when occluded or in the background.
|
||||
*/
|
||||
|
||||
/** Parsed result of one window capture. */
interface CaptureResult {
  /** Captured bitmap width, as printed by the capture script. */
  width: number
  /** Captured bitmap height, as printed by the capture script. */
  height: number
  /** Base64 text of the PNG-encoded bitmap. */
  base64: string
}
|
||||
|
||||
/**
 * PowerShell prelude that compiles the `WinCap` C# helper via `Add-Type`.
 *
 * `[WinCap]::Capture(title)` looks a window up by exact title with
 * `FindWindow`, renders it into a bitmap with `PrintWindow` (flag 2, i.e.
 * PW_RENDERFULLCONTENT in WinUser.h) and returns
 * `"<width>,<height>,<base64 PNG>"`. Failures are reported as comma-free
 * sentinels: `NOT_FOUND`, `INVALID_SIZE` or `CAPTURE_FAILED`.
 *
 * `CAPTURE_FAILED` is returned when `PrintWindow` reports failure, so an
 * untouched (blank) bitmap is never handed back as a successful capture.
 * The `using` blocks release the GDI+ objects even if encoding throws.
 *
 * NOTE: the closing `'@` of the here-string must stay at the start of its
 * line, and the C# must stay compatible with the compiler that `Add-Type`
 * uses in Windows PowerShell (no C# 6+ syntax).
 */
const CAPTURE_BY_TITLE_PS = `
Add-Type -AssemblyName System.Drawing
Add-Type -ReferencedAssemblies System.Drawing @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
public class WinCap {
    [DllImport("user32.dll", CharSet=CharSet.Unicode)]
    public static extern IntPtr FindWindow(string c, string t);
    [DllImport("user32.dll")]
    public static extern bool GetWindowRect(IntPtr h, out RECT r);
    [DllImport("user32.dll")]
    public static extern bool PrintWindow(IntPtr h, IntPtr hdc, uint f);
    [StructLayout(LayoutKind.Sequential)]
    public struct RECT { public int L, T, R, B; }

    public static string Capture(string title) {
        IntPtr hwnd = FindWindow(null, title);
        if (hwnd == IntPtr.Zero) return "NOT_FOUND";
        RECT r;
        if (!GetWindowRect(hwnd, out r)) return "NOT_FOUND";
        int w = r.R - r.L; int h = r.B - r.T;
        if (w <= 0 || h <= 0) return "INVALID_SIZE";
        using (Bitmap bmp = new Bitmap(w, h))
        using (System.IO.MemoryStream ms = new System.IO.MemoryStream()) {
            bool ok = false;
            using (Graphics g = Graphics.FromImage(bmp)) {
                IntPtr hdc = g.GetHdc();
                try { ok = PrintWindow(hwnd, hdc, 2); }
                finally { g.ReleaseHdc(hdc); }
            }
            if (!ok) return "CAPTURE_FAILED";
            bmp.Save(ms, ImageFormat.Png);
            return w + "," + h + "," + Convert.ToBase64String(ms.ToArray());
        }
    }
}
'@
`
|
||||
|
||||
/**
 * PowerShell prelude that compiles the `WinCapH` C# helper via `Add-Type`.
 *
 * `[WinCapH]::Capture(hwnd)` validates the handle with `IsWindow`, renders
 * the window into a bitmap with `PrintWindow` (flag 2, i.e.
 * PW_RENDERFULLCONTENT in WinUser.h) and returns
 * `"<width>,<height>,<base64 PNG>"`. Failures are reported as comma-free
 * sentinels: `NOT_FOUND`, `INVALID_SIZE` or `CAPTURE_FAILED`.
 *
 * `CAPTURE_FAILED` is returned when `PrintWindow` reports failure, so an
 * untouched (blank) bitmap is never handed back as a successful capture.
 * The `using` blocks release the GDI+ objects even if encoding throws.
 *
 * NOTE: the closing `'@` of the here-string must stay at the start of its
 * line, and the C# must stay compatible with the compiler that `Add-Type`
 * uses in Windows PowerShell (no C# 6+ syntax).
 */
const CAPTURE_BY_HWND_PS = `
Add-Type -AssemblyName System.Drawing
Add-Type -ReferencedAssemblies System.Drawing @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
public class WinCapH {
    [DllImport("user32.dll")]
    public static extern bool GetWindowRect(IntPtr h, out RECT r);
    [DllImport("user32.dll")]
    public static extern bool PrintWindow(IntPtr h, IntPtr hdc, uint f);
    [DllImport("user32.dll")]
    public static extern bool IsWindow(IntPtr hWnd);
    [StructLayout(LayoutKind.Sequential)]
    public struct RECT { public int L, T, R, B; }

    public static string Capture(IntPtr hwnd) {
        if (!IsWindow(hwnd)) return "NOT_FOUND";
        RECT r;
        if (!GetWindowRect(hwnd, out r)) return "NOT_FOUND";
        int w = r.R - r.L; int h = r.B - r.T;
        if (w <= 0 || h <= 0) return "INVALID_SIZE";
        using (Bitmap bmp = new Bitmap(w, h))
        using (System.IO.MemoryStream ms = new System.IO.MemoryStream()) {
            bool ok = false;
            using (Graphics g = Graphics.FromImage(bmp)) {
                IntPtr hdc = g.GetHdc();
                try { ok = PrintWindow(hwnd, hdc, 2); }
                finally { g.ReleaseHdc(hdc); }
            }
            if (!ok) return "CAPTURE_FAILED";
            bmp.Save(ms, ImageFormat.Png);
            return w + "," + h + "," + Convert.ToBase64String(ms.ToArray());
        }
    }
}
'@
`
|
||||
|
||||
/**
 * Parse the `"<width>,<height>,<base64>"` line printed by a capture script.
 *
 * Only the first two commas are used as separators; everything after the
 * second comma is taken as the base64 payload.
 *
 * @param raw - Raw stdout text of the capture script.
 * @returns The parsed capture, or `null` when the output is empty, has fewer
 *   than two commas (this covers sentinel words such as `NOT_FOUND` and
 *   `INVALID_SIZE`), has an empty payload, or has a width/height that is not
 *   a positive decimal integer.
 */
function parseCaptureOutput(raw: string): CaptureResult | null {
  const trimmed = raw.trim()
  const firstComma = trimmed.indexOf(',')
  const secondComma = firstComma === -1 ? -1 : trimmed.indexOf(',', firstComma + 1)
  if (secondComma === -1) return null

  const widthText = trimmed.slice(0, firstComma)
  const heightText = trimmed.slice(firstComma + 1, secondComma)
  const base64 = trimmed.slice(secondComma + 1)

  // Require plain decimal digits: Number() alone would also accept negative,
  // fractional, hexadecimal and "Infinity" values as image dimensions.
  const digitsOnly = /^\d+$/
  if (!digitsOnly.test(widthText) || !digitsOnly.test(heightText)) return null

  const width = Number(widthText)
  const height = Number(heightText)
  if (width <= 0 || height <= 0 || !base64) return null

  return { base64, width, height }
}
|
||||
|
||||
/**
 * Run a PowerShell script synchronously and return what it wrote to stdout.
 *
 * The script text is passed as the `-Command` argument of `powershell`
 * started with `-NoProfile -NonInteractive`. stderr is piped but not read.
 *
 * @param script - PowerShell source to execute.
 * @returns stdout decoded with `TextDecoder` and trimmed.
 */
function runPs(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({
    cmd: argv,
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
|
||||
|
||||
/**
 * Capture a window screenshot by its exact title.
 * Uses PrintWindow which works even for occluded/background windows.
 *
 * @param title - Exact window title passed to `[WinCap]::Capture`.
 * @returns The parsed capture, or `null` when `parseCaptureOutput` rejects
 *   the script output (for example when the window is not found).
 */
export function captureWindow(title: string): CaptureResult | null {
  // PowerShell closes a single-quoted string on ASCII ' and also on the
  // typographic quotes U+2018, U+2019, U+201A and U+201B, so each of them is
  // doubled. Escaping only ' would let a window title containing a curly
  // quote break out of the literal and run arbitrary script text.
  const escaped = title.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `${CAPTURE_BY_TITLE_PS}\n[WinCap]::Capture('${escaped}')`
  const raw = runPs(script)
  return parseCaptureOutput(raw)
}
|
||||
|
||||
/**
 * Capture a window screenshot by its HWND handle.
 *
 * @param hwnd - Window handle as an integer; it is spliced into the script
 *   as the argument of `[IntPtr]::new(...)`.
 * @returns The parsed capture, or `null` when `hwnd` is not a safe integer
 *   or when `parseCaptureOutput` rejects the script output.
 */
export function captureWindowByHwnd(hwnd: number): CaptureResult | null {
  // The handle is interpolated verbatim, so NaN, fractions, or values that
  // stringify in exponent form would produce an invalid IntPtr expression.
  // Reject them before spawning PowerShell.
  if (!Number.isSafeInteger(hwnd)) return null

  const script = `${CAPTURE_BY_HWND_PS}\n[WinCapH]::Capture([IntPtr]::new(${hwnd}))`
  const raw = runPs(script)
  return parseCaptureOutput(raw)
}
|
||||
86
src/utils/computerUse/win32/windowEnum.ts
Normal file
86
src/utils/computerUse/win32/windowEnum.ts
Normal file
@@ -0,0 +1,86 @@
|
||||
/**
|
||||
* Window enumeration using Win32 EnumWindows API.
|
||||
* Returns visible windows with their HWND, PID, and title.
|
||||
*/
|
||||
|
||||
/** One visible, titled window reported by the enumeration script. */
export interface WindowInfo {
  /** Window title text (never empty). */
  title: string
  /** Id of the process that owns the window. */
  pid: number
  /** Window handle, parsed from the script's `hWnd.ToInt64()` output. */
  hwnd: number
}
|
||||
|
||||
/**
 * PowerShell script that compiles the `WinEnum` C# helper via `Add-Type`,
 * runs it, and prints one `hwnd|pid|title` line per window.
 *
 * `WinEnum.Run()` walks the windows reported by `EnumWindows` and skips those
 * that are not visible or whose title is empty or whitespace-only. `hwnd` is
 * printed as `hWnd.ToInt64()` and `pid` comes from `GetWindowThreadProcessId`.
 *
 * The first statement switches the console output encoding to UTF-8 (without
 * BOM). The caller decodes stdout as UTF-8, and without this PowerShell would
 * write redirected output in the console's default code page, which garbles
 * non-ASCII window titles. The assignment is wrapped in try/catch so that an
 * environment where it cannot be set keeps the previous behaviour.
 *
 * NOTE: the closing `'@` of the here-string must stay at the start of its
 * line.
 */
const ENUM_WINDOWS_PS = `
try { [Console]::OutputEncoding = [System.Text.UTF8Encoding]::new($false) } catch {}
Add-Type @'
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
public class WinEnum {
    public delegate bool EnumWindowsProc(IntPtr hWnd, IntPtr lParam);

    [DllImport("user32.dll")]
    public static extern bool EnumWindows(EnumWindowsProc lpEnumFunc, IntPtr lParam);
    [DllImport("user32.dll")]
    public static extern bool IsWindowVisible(IntPtr hWnd);
    [DllImport("user32.dll", CharSet=CharSet.Unicode)]
    public static extern int GetWindowText(IntPtr hWnd, StringBuilder lpString, int nMaxCount);
    [DllImport("user32.dll")]
    public static extern int GetWindowTextLength(IntPtr hWnd);
    [DllImport("user32.dll")]
    public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint processId);

    public static List<string> results = new List<string>();

    public static void Run() {
        results.Clear();
        EnumWindows(delegate(IntPtr hWnd, IntPtr lParam) {
            if (!IsWindowVisible(hWnd)) return true;
            int len = GetWindowTextLength(hWnd);
            if (len == 0) return true;
            StringBuilder sb = new StringBuilder(len + 1);
            GetWindowText(hWnd, sb, sb.Capacity);
            string title = sb.ToString();
            if (string.IsNullOrWhiteSpace(title)) return true;
            uint pid = 0;
            GetWindowThreadProcessId(hWnd, out pid);
            results.Add(hWnd.ToInt64() + "|" + pid + "|" + title);
            return true;
        }, IntPtr.Zero);
    }
}
'@
[WinEnum]::Run()
[WinEnum]::results | ForEach-Object { $_ }
`
|
||||
|
||||
/**
 * List all visible windows with non-empty titles.
 *
 * Runs the enumeration script through `powershell` and parses each
 * `hwnd|pid|title` output line. Only the first two `|` characters are treated
 * as separators, so the title itself may contain `|`. Lines that lack two
 * separators, have a non-numeric hwnd/pid, or have an empty title are dropped.
 *
 * @returns One `WindowInfo` per parsable line; `[]` when the script printed
 *   nothing.
 */
export function listWindows(): WindowInfo[] {
  const { stdout } = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', ENUM_WINDOWS_PS],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const output = new TextDecoder().decode(stdout).trim()
  if (!output) return []

  const windows: WindowInfo[] = []
  for (const rawLine of output.split('\n')) {
    if (!rawLine) continue

    // trim() also removes the trailing \r of CRLF line endings.
    const entry = rawLine.trim()
    const hwndEnd = entry.indexOf('|')
    const pidEnd = entry.indexOf('|', hwndEnd + 1)
    if (hwndEnd === -1 || pidEnd === -1) continue

    const hwnd = Number(entry.slice(0, hwndEnd))
    const pid = Number(entry.slice(hwndEnd + 1, pidEnd))
    const title = entry.slice(pidEnd + 1)
    if (Number.isNaN(hwnd) || Number.isNaN(pid) || !title) continue

    windows.push({ hwnd, pid, title })
  }
  return windows
}
|
||||
Reference in New Issue
Block a user