diff --git a/DEV-LOG.md b/DEV-LOG.md index 42d269d3a..c9e395112 100644 --- a/DEV-LOG.md +++ b/DEV-LOG.md @@ -1,5 +1,32 @@ # DEV-LOG +## Computer Use Windows 增强:窗口绑定截图 + UI Automation + OCR (2026-04-03) + +在三平台基础实现之上,利用 Windows 原生 API 增强 Computer Use 的 Windows 专属能力。 + +**新增文件:** + +| 文件 | 行数 | 说明 | +|------|------|------| +| `src/utils/computerUse/win32/windowCapture.ts` | — | `PrintWindow` 窗口绑定截图,支持被遮挡/后台窗口 | +| `src/utils/computerUse/win32/windowEnum.ts` | — | `EnumWindows` 精确窗口枚举(HWND + PID + 标题) | +| `src/utils/computerUse/win32/uiAutomation.ts` | — | `IUIAutomation` UI 元素树读取、按钮点击、文本写入、坐标识别 | +| `src/utils/computerUse/win32/ocr.ts` | — | `Windows.Media.Ocr` 截图+文字识别(英语+中文) | + +**修改文件:** + +| 文件 | 变更 | +|------|------| +| `packages/@ant/computer-use-swift/src/backends/win32.ts` | `listRunning` 改用 EnumWindows;新增 `captureWindowTarget` 窗口级截图 | + +**验证结果(Windows x64):** +- 窗口枚举:38 个可见窗口 ✅ +- 窗口截图:VS Code 2575x1415, 444KB ✅(PrintWindow, 即使被遮挡) +- UI Automation:坐标元素识别 ✅ +- OCR:识别 VS Code 界面文字,34 行 ✅ + +--- + ## Enable Computer Use — macOS + Windows + Linux (2026-04-03) 恢复 Computer Use 屏幕操控功能。参考项目仅 macOS,本次扩展为三平台支持。 diff --git a/docs/features/computer-use-windows-enhancement.md b/docs/features/computer-use-windows-enhancement.md new file mode 100644 index 000000000..288da5daf --- /dev/null +++ b/docs/features/computer-use-windows-enhancement.md @@ -0,0 +1,315 @@ +# Computer Use Windows 增强实施计划 + +更新时间:2026-04-03 +依赖文档:`docs/features/windows-ai-desktop-control.md`、`docs/features/computer-use.md` + +## 1. 目标 + +在已有的 PowerShell 子进程方案基础上,利用 Windows 原生 API 增强 Computer Use 的 Windows 实现,解决 3 个核心问题: + +1. **窗口绑定截图**:当前 `CopyFromScreen` 只能全屏截图,无法对指定窗口截图(尤其是被遮挡/最小化窗口) +2. **UI 结构感知**:当前只能通过坐标点击,无法像 macOS Accessibility 那样理解 UI 元素树 +3. **性能**:每次 PowerShell 启动约 273ms,剪贴板/窗口枚举等高频操作需要更快的方式 + +## 2. 
已验证的 Windows API 能力 + +以下 API 全部通过 PowerShell P/Invoke 实测通过: + +| 能力 | API | 验证结果 | +|------|-----|---------| +| 窗口绑定截图 | `PrintWindow(hwnd, hdc, PW_RENDERFULLCONTENT)` | ✅ VS Code 342KB, Chrome 273KB | +| 枚举窗口+HWND | `EnumWindows` + `GetWindowText` + `GetWindowThreadProcessId` | ✅ 38 个窗口,含 HWND/PID/标题 | +| UI 元素树 | `System.Windows.Automation.AutomationElement` | ✅ 记事本 39 个元素 | +| UI 写值 | `ValuePattern.SetValue()` | ✅ 成功写入记事本文本 | +| UI 点击 | `InvokePattern.Invoke()` | ✅ 按钮可程序化点击 | +| 坐标元素识别 | `AutomationElement.FromPoint(x, y)` | ✅ 返回元素类型+名称 | +| OCR | `Windows.Media.Ocr.OcrEngine` | ✅ 英语+中文引擎可用 | +| 全局热键 | `RegisterHotKey` | ✅ API 可调 | +| 剪贴板直接操作 | `System.Windows.Forms.Clipboard` | ✅ 读/写/图片检测 | +| Shell 启动 | `ShellExecute` | ✅ 打开文件/URL/应用 | + +## 3. 架构设计 + +### 3.1 文件结构 + +在现有 `backends/win32.ts` 基础上新增 Windows 专属模块: + +``` +packages/@ant/computer-use-input/src/ +├── backends/ +│ ├── darwin.ts ← 不动 +│ ├── win32.ts ← 增强:直接 Win32 API 替代部分 PowerShell +│ └── linux.ts ← 不动 + +packages/@ant/computer-use-swift/src/ +├── backends/ +│ ├── darwin.ts ← 不动 +│ ├── win32.ts ← 增强:PrintWindow 窗口截图 + EnumWindows +│ └── linux.ts ← 不动 + +packages/@ant/computer-use-mcp/src/ +│ └── tools.ts ← 增加 Windows 专属工具定义(UI Automation、OCR) + +src/utils/computerUse/ +│ └── win32/ ← 新增目录:Windows 专属能力 +│ ├── uiAutomation.ts ← UI 元素树、点击、写值 +│ ├── ocr.ts ← 截图 + OCR 文字识别 +│ ├── windowCapture.ts ← PrintWindow 窗口绑定截图 +│ └── windowEnum.ts ← EnumWindows 窗口枚举 +``` + +### 3.2 分层 + +``` +┌──────────────────────────────────────────────┐ +│ Computer Use MCP Tools │ +│ screenshot / click / type / request_access │ +│ + Windows 专属: ui_tree / ocr / window_cap │ +├──────────────────────────────────────────────┤ +│ src/utils/computerUse/ │ +│ executor.ts → 按平台 dispatch │ +│ win32/ → Windows 专属能力模块 │ +├──────────────────────────────────────────────┤ +│ packages/@ant/computer-use-{input,swift} │ +│ backends/win32.ts → PowerShell + Win32 API │ +├──────────────────────────────────────────────┤ +│ Windows Native API │ +│ 
PrintWindow / EnumWindows / UI Automation │ +│ SendInput / Clipboard / OCR / ShellExecute │ +└──────────────────────────────────────────────┘ +``` + +## 4. 实施计划 + +### Phase A:窗口绑定截图(解决核心问题) + +**问题**:当前 `CopyFromScreen` 只能全屏截图,无法对指定窗口截图。 +**方案**:用 `PrintWindow` + `FindWindow` 实现窗口级截图。 + +| 步骤 | 文件 | 改动 | +|------|------|------| +| A.1 | `src/utils/computerUse/win32/windowCapture.ts` | 新建:`captureWindow(title)` 用 PrintWindow 截取指定窗口 | +| A.2 | `src/utils/computerUse/win32/windowEnum.ts` | 新建:`listWindows()` 用 EnumWindows 返回 {hwnd, pid, title}[] | +| A.3 | `packages/@ant/computer-use-swift/src/backends/win32.ts` | `screenshot.captureExcluding` 增加按窗口截图能力 | +| A.4 | `packages/@ant/computer-use-swift/src/backends/win32.ts` | `apps.listRunning` 用 EnumWindows 替代 Get-Process(返回 HWND) | + +**PowerShell 脚本核心**: + +```powershell +# PrintWindow 截取指定窗口 +Add-Type -AssemblyName System.Drawing +Add-Type -ReferencedAssemblies System.Drawing @' +using System; using System.Runtime.InteropServices; using System.Drawing; using System.Drawing.Imaging; +public class WinCap { + [DllImport("user32.dll", CharSet=CharSet.Unicode)] + public static extern IntPtr FindWindow(string c, string t); + [DllImport("user32.dll")] + public static extern bool GetWindowRect(IntPtr h, out RECT r); + [DllImport("user32.dll")] + public static extern bool PrintWindow(IntPtr h, IntPtr hdc, uint f); + [StructLayout(LayoutKind.Sequential)] + public struct RECT { public int L, T, R, B; } + // ... 
CaptureByTitle(string title) → base64 +} +'@ +``` + +**验证标准**: +- 能按窗口标题截图 +- 被遮挡的窗口也能截图 +- 返回 base64 + width + height + +### Phase B:UI Automation(Windows 专属新能力) + +**问题**:macOS 有 Accessibility API 可以读取/操作 UI 元素,Windows 当前只能坐标点击。 +**方案**:用 `System.Windows.Automation` 实现 UI 树读取和元素操作。 + +| 步骤 | 文件 | 改动 | +|------|------|------| +| B.1 | `src/utils/computerUse/win32/uiAutomation.ts` | 新建:核心 UIA 操作封装 | +| B.2 | `packages/@ant/computer-use-mcp/src/tools.ts` | 增加 Windows 专属工具定义 | + +**uiAutomation.ts 导出函数**: + +```typescript +// 获取窗口的 UI 元素树 +getUITree(windowTitle: string, depth: number): UIElement[] + +// 按名称/类型/AutomationId 查找元素 +findElement(windowTitle: string, query: {name?, controlType?, automationId?}): UIElement | null + +// 点击元素(InvokePattern) +clickElement(windowTitle: string, automationId: string): boolean + +// 设置元素值(ValuePattern) +setValue(windowTitle: string, automationId: string, value: string): boolean + +// 获取坐标处的元素 +elementAtPoint(x: number, y: number): UIElement | null +``` + +**UIElement 类型**: +```typescript +interface UIElement { + name: string + controlType: string // Button, Edit, Text, List, etc. 
+ automationId: string + boundingRect: { x: number, y: number, w: number, h: number } + isEnabled: boolean + value?: string // ValuePattern 可用时 + children?: UIElement[] +} +``` + +**PowerShell 脚本核心**: +```powershell +Add-Type -AssemblyName UIAutomationClient +Add-Type -AssemblyName UIAutomationTypes + +# 读取 UI 树 +$root = [AutomationElement]::RootElement +$window = $root.FindFirst([TreeScope]::Children, + [PropertyCondition]::new([AutomationElement]::NameProperty, $title)) +$elements = $window.FindAll([TreeScope]::Descendants, [Condition]::TrueCondition) + +# 写入文本 +$element.GetCurrentPattern([ValuePattern]::Pattern).SetValue($text) + +# 点击按钮 +$element.GetCurrentPattern([InvokePattern]::Pattern).Invoke() +``` + +**验证标准**: +- 能读取记事本的 UI 树(按钮、文本框、菜单) +- 能向文本框写入内容 +- 能点击按钮 +- 能识别坐标处的元素 + +### Phase C:OCR 屏幕文字识别 + +**问题**:截图后 AI 只能看到图片,无法直接读取文字。 +**方案**:用 `Windows.Media.Ocr` 对截图进行文字识别。 + +| 步骤 | 文件 | 改动 | +|------|------|------| +| C.1 | `src/utils/computerUse/win32/ocr.ts` | 新建:截图 + OCR 识别 | +| C.2 | `packages/@ant/computer-use-mcp/src/tools.ts` | 增加 `screen_ocr` 工具定义 | + +**ocr.ts 导出函数**: +```typescript +// 对屏幕区域 OCR +ocrRegion(x: number, y: number, w: number, h: number, lang?: string): OcrResult + +// 对指定窗口 OCR +ocrWindow(windowTitle: string, lang?: string): OcrResult + +interface OcrResult { + text: string + lines: { text: string, bounds: {x,y,w,h} }[] + language: string +} +``` + +**已确认可用语言**:英语 (en-US) + 中文 (zh-Hans-CN) + +**验证标准**: +- 能识别屏幕区域中的英文和中文 +- 返回文字内容 + 每行的位置信息 + +### Phase D:高频操作性能优化 + +**问题**:每次 PowerShell 启动 273ms,鼠标移动等高频操作太慢。 +**方案**:用 .NET `System.Windows.Forms.Clipboard` 等直接 API 替代 PowerShell 子进程。 + +| 步骤 | 文件 | 改动 | +|------|------|------| +| D.1 | `src/utils/computerUse/executor.ts` | 剪贴板操作用直接 API 替代 PowerShell | +| D.2 | 考虑驻留 PowerShell 进程 | 通过 stdin/stdout 交互,摊平启动成本 | + +**剪贴板直接 API**(不需要 PowerShell 子进程): +```powershell +# 读:50ms → <1ms +[System.Windows.Forms.Clipboard]::GetText() + +# 写:50ms → <1ms 
+[System.Windows.Forms.Clipboard]::SetText($text) + +# 图片检测 +[System.Windows.Forms.Clipboard]::ContainsImage() +``` + +### Phase E:`request_access` Windows 适配 + +**问题**:`request_access` 依赖 macOS bundleId 识别应用,Windows 没有这个概念。 +**方案**:在 Windows 上用 exe 路径 + 窗口标题替代 bundleId。 + +| 步骤 | 文件 | 改动 | +|------|------|------| +| E.1 | `packages/@ant/computer-use-mcp/src/toolCalls.ts` | `resolveRequestedApps` 在 Windows 上用 exe 路径匹配 | +| E.2 | `packages/@ant/computer-use-mcp/src/sentinelApps.ts` | 增加 Windows 危险应用列表(cmd.exe, powershell.exe 等) | +| E.3 | `packages/@ant/computer-use-mcp/src/deniedApps.ts` | 增加 Windows 浏览器/终端识别规则 | +| E.4 | `src/utils/computerUse/hostAdapter.ts` | `ensureOsPermissions` Windows 上检查 UAC 状态 | + +**Windows 应用标识映射**: +``` +macOS bundleId → Windows 等价 +com.apple.Safari → C:\Program Files\...\msedge.exe(或窗口标题匹配) +com.google.Chrome → chrome.exe +com.apple.Terminal → WindowsTerminal.exe / cmd.exe +``` + +### Phase F:全局热键(ESC 拦截) + +**问题**:当前非 darwin 直接跳过 ESC 热键,用 Ctrl+C 替代。 +**方案**:用 `RegisterHotKey` 或 `SetWindowsHookEx(WH_KEYBOARD_LL)` 实现。 + +| 步骤 | 文件 | 改动 | +|------|------|------| +| F.1 | `src/utils/computerUse/escHotkey.ts` | Windows 分支:RegisterHotKey 注册 ESC | + +**优先级低**——当前 Ctrl+C fallback 可用,ESC 热键是体验优化。 + +## 5. 执行优先级 + +``` +Phase A: 窗口绑定截图 ← P0 核心需求,解决"操作其他界面" +Phase B: UI Automation ← P0 核心能力,AI 理解 UI 结构 +Phase C: OCR ← P1 增值能力,AI 读屏幕文字 +Phase D: 性能优化 ← P1 体验优化,高频操作提速 +Phase E: request_access 适配 ← P1 功能完整性,权限模型适配 +Phase F: ESC 热键 ← P2 体验优化,可后做 +``` + +## 6. 每个 Phase 的改动量估算 + +| Phase | 新增文件 | 修改文件 | 新增代码行 | 风险 | +|-------|---------|---------|-----------|------| +| A 窗口截图 | 2 | 1 | ~200 | 低 | +| B UI Automation | 1 | 1 | ~300 | 中 | +| C OCR | 1 | 1 | ~150 | 低 | +| D 性能优化 | 0 | 1 | ~50 | 低 | +| E request_access | 0 | 4 | ~100 | 中 | +| F ESC 热键 | 0 | 1 | ~50 | 低 | +| **总计** | **4** | **9** | **~850** | — | + +## 7. 
不动的文件 + +- `backends/darwin.ts`(两个包都不动) +- `backends/linux.ts`(两个包都不动) +- `src/utils/computerUse/` 中 macOS 相关代码路径不动 +- `packages/@ant/computer-use-mcp/src/` 中已复制的参考项目代码不动(只追加 Windows 工具) + +## 8. 与 macOS/Linux 方案的对比 + +| 能力 | macOS | Windows (增强后) | Linux | +|------|-------|-----------------|-------| +| 截图方式 | SCContentFilter (per-app) | **PrintWindow (per-window)** | scrot (全屏/区域) | +| UI 结构 | Accessibility API | **UI Automation** | 无 | +| OCR | 无内置 | **Windows.Media.Ocr** | 无内置 | +| 键鼠 | CGEvent + enigo | SendInput + keybd_event | xdotool | +| 窗口管理 | NSWorkspace | **EnumWindows + Win32** | wmctrl | +| 剪贴板 | pbcopy/pbpaste | **Clipboard 直接 API** | xclip | +| ESC 热键 | CGEventTap | RegisterHotKey | 无 | +| 应用标识 | bundleId | exe 路径 + 窗口标题 | /proc + wmctrl | + +**Windows 增强后将在 UI Automation 和 OCR 方面超过 macOS 方案**——这两项 macOS 原始实现也没有(Anthropic 用的是截图 + Claude 视觉理解,没有结构化 UI 数据)。 diff --git a/packages/@ant/computer-use-swift/src/backends/win32.ts b/packages/@ant/computer-use-swift/src/backends/win32.ts index fc79648e7..70f006bf1 100644 --- a/packages/@ant/computer-use-swift/src/backends/win32.ts +++ b/packages/@ant/computer-use-swift/src/backends/win32.ts @@ -11,6 +11,9 @@ import type { SwiftBackend, WindowDisplayInfo, } from '../types.js' +import { listWindows } from 'src/utils/computerUse/win32/windowEnum.js' +import { captureWindow, captureWindowByHwnd } from 'src/utils/computerUse/win32/windowCapture.js' + // --------------------------------------------------------------------------- // PowerShell helper // --------------------------------------------------------------------------- @@ -155,11 +158,11 @@ $apps | Select-Object -Unique | Select-Object -First 200 listRunning() { try { - const raw = ps(`Get-Process | Where-Object { $_.MainWindowTitle -ne '' } | Select-Object -First 50 | ForEach-Object { "$($_.MainModule.FileName)|$($_.ProcessName)" }`) - return raw.split('\n').filter(Boolean).map(line => { - const [exePath, name] = line.split('|', 2) - return { bundleId: 
exePath ?? '', displayName: name ?? '' } - }) + const windows = listWindows() + return windows.map(w => ({ + bundleId: String(w.hwnd), + displayName: w.title, + })) } catch { return [] } @@ -246,4 +249,15 @@ $ms.Dispose() const base64 = raw.slice(secondComma + 1) return { base64, width: w, height: h } }, + + /** + * Capture a specific window by title or HWND using PrintWindow. + * Works even for occluded or background windows. + */ + captureWindowTarget(titleOrHwnd: string | number): ScreenshotResult | null { + if (typeof titleOrHwnd === 'number') { + return captureWindowByHwnd(titleOrHwnd) + } + return captureWindow(titleOrHwnd) + }, } diff --git a/src/utils/computerUse/win32/ocr.ts b/src/utils/computerUse/win32/ocr.ts new file mode 100644 index 000000000..69ca3a6e1 --- /dev/null +++ b/src/utils/computerUse/win32/ocr.ts @@ -0,0 +1,257 @@ +/** + * OCR module using Windows.Media.Ocr.OcrEngine via PowerShell. + * Captures a screen region or window, then runs WinRT OCR to extract text. + */ + +export interface OcrLine { + text: string + bounds: { x: number; y: number; w: number; h: number } +} + +export interface OcrResult { + text: string + lines: OcrLine[] + language: string +} + +function emptyResult(language: string): OcrResult { + return { text: '', lines: [], language } +} + +function runPs(script: string): string { + const result = Bun.spawnSync({ + cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script], + stdout: 'pipe', + stderr: 'pipe', + }) + return new TextDecoder().decode(result.stdout).trim() +} + +/** + * PowerShell script that: + * 1. Screenshots a screen region using CopyFromScreen + * 2. Saves to temp PNG + * 3. Loads via WinRT BitmapDecoder -> SoftwareBitmap + * 4. Runs OcrEngine.RecognizeAsync + * 5. 
/**
 * Build the PowerShell script that OCRs a rectangular screen region.
 *
 * The generated script:
 *   1. copies the region with Graphics.CopyFromScreen,
 *   2. saves it to a temporary PNG,
 *   3. loads the PNG through WinRT BitmapDecoder into a SoftwareBitmap,
 *   4. runs OcrEngine.RecognizeAsync, falling back to en-US when no engine
 *      can be created for `lang`,
 *   5. prints one compact JSON object: { text, lines: [{ text, bounds }], language }.
 *
 * Each line's bounds are the union of its word rectangles as reported by the
 * engine; the script does not add the region origin (x, y) to them.
 *
 * On any failure the script prints an empty result carrying the requested
 * language. The temporary PNG and the WinRT stream are released in a
 * `finally` block, so they are cleaned up on every exit path.
 *
 * @param x - Left edge of the region; truncated to a whole number
 * @param y - Top edge of the region; truncated to a whole number
 * @param w - Region width in pixels; truncated to a whole number
 * @param h - Region height in pixels; truncated to a whole number
 * @param lang - Language tag handed to Windows.Globalization.Language
 * @returns The PowerShell script text
 * @throws RangeError when a coordinate or size is not a finite number
 */
function buildOcrRegionScript(
  x: number,
  y: number,
  w: number,
  h: number,
  lang: string,
): string {
  // These values are spliced into the script as bare tokens, so only finite
  // whole numbers are allowed through.
  const toInt = (value: number, label: string): number => {
    if (typeof value !== 'number' || !Number.isFinite(value)) {
      throw new RangeError(`buildOcrRegionScript: ${label} must be a finite number`)
    }
    return Math.trunc(value)
  }
  const left = toInt(x, 'x')
  const top = toInt(y, 'y')
  const width = toInt(w, 'w')
  const height = toInt(h, 'h')

  // Double single quotes so the tag cannot terminate the PowerShell literal.
  const psLang = String(lang).replace(/'/g, "''")

  return `
Add-Type -AssemblyName System.Drawing
Add-Type -AssemblyName System.Runtime.WindowsRuntime

# Load WinRT types
$null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Graphics.Imaging.SoftwareBitmap, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Graphics.Imaging.BitmapDecoder, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Storage.StorageFile, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Storage.Streams.RandomAccessStream, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Globalization.Language, Windows.Foundation, ContentType = WindowsRuntime]

# Await helper for WinRT async operations
$asTaskGeneric = ([System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
  $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1 -and
  $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation\`1'
})[0]
Function Await($WinRtTask, $ResultType) {
  $asTask = $asTaskGeneric.MakeGenericMethod($ResultType)
  $netTask = $asTask.Invoke($null, @($WinRtTask))
  $netTask.Wait(-1) | Out-Null
  $netTask.Result
}

$requestedLang = '${psLang}'
$bmp = $null
$stream = $null
$tmpFile = $null

try {
  # Step 1: Screenshot region
  $bmp = New-Object System.Drawing.Bitmap(${width}, ${height})
  $g = [System.Drawing.Graphics]::FromImage($bmp)
  try {
    $g.CopyFromScreen(${left}, ${top}, 0, 0, (New-Object System.Drawing.Size(${width}, ${height})))
  } finally {
    $g.Dispose()
  }

  # Step 2: Save to temp file
  $tmpFile = [System.IO.Path]::Combine([System.IO.Path]::GetTempPath(), "ocrtemp_$([guid]::NewGuid().ToString('N')).png")
  $bmp.Save($tmpFile, [System.Drawing.Imaging.ImageFormat]::Png)

  # Step 3: Open as StorageFile -> BitmapDecoder -> SoftwareBitmap
  $storageFile = Await ([Windows.Storage.StorageFile]::GetFileFromPathAsync($tmpFile)) ([Windows.Storage.StorageFile])
  $stream = Await ($storageFile.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
  $decoder = Await ([Windows.Graphics.Imaging.BitmapDecoder]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
  $softwareBmp = Await ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])

  # Step 4: Create OCR engine
  $ocrLang = New-Object Windows.Globalization.Language($requestedLang)
  $engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
  if ($engine -eq $null) {
    # Fallback to en-US
    $ocrLang = New-Object Windows.Globalization.Language('en-US')
    $engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
  }
  if ($engine -eq $null) {
    Write-Output (ConvertTo-Json @{ text = ''; lines = @(); language = $requestedLang } -Compress)
    return
  }

  # Step 5: Run OCR
  $ocrResult = Await ($engine.RecognizeAsync($softwareBmp)) ([Windows.Media.Ocr.OcrResult])

  # Step 6: Extract lines with bounding rects
  $lines = @()
  foreach ($line in $ocrResult.Lines) {
    $minX = [double]::MaxValue; $minY = [double]::MaxValue
    $maxX = 0.0; $maxY = 0.0
    foreach ($word in $line.Words) {
      $r = $word.BoundingRect
      if ($r.X -lt $minX) { $minX = $r.X }
      if ($r.Y -lt $minY) { $minY = $r.Y }
      if (($r.X + $r.Width) -gt $maxX) { $maxX = $r.X + $r.Width }
      if (($r.Y + $r.Height) -gt $maxY) { $maxY = $r.Y + $r.Height }
    }
    $lines += @{
      text = $line.Text
      bounds = @{
        x = [int]$minX
        y = [int]$minY
        w = [int]($maxX - $minX)
        h = [int]($maxY - $minY)
      }
    }
  }

  $output = @{
    text = $ocrResult.Text
    lines = $lines
    language = $ocrLang.LanguageTag
  }
  Write-Output (ConvertTo-Json $output -Depth 4 -Compress)
} catch {
  Write-Output (ConvertTo-Json @{ text = ''; lines = @(); language = $requestedLang } -Compress)
} finally {
  # Runs on success, on the early return above, and on errors.
  if ($stream -ne $null) { $stream.Dispose() }
  if ($bmp -ne $null) { $bmp.Dispose() }
  if ($tmpFile -ne $null) { Remove-Item -LiteralPath $tmpFile -ErrorAction SilentlyContinue }
}
`
}
/**
 * Build the PowerShell script that resolves a window's screen rectangle from
 * its exact title.
 *
 * The script prints "left,top,width,height" on success, "NOT_FOUND" when
 * FindWindow yields no handle or the rectangle cannot be read, and
 * "INVALID_SIZE" when the rectangle has a non-positive width or height.
 */
function buildGetWindowRectScript(windowTitle: string): string {
  // Double single quotes so the title cannot terminate the PowerShell literal.
  const psTitle = windowTitle.replace(/'/g, "''")
  return `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WinRect {
  [DllImport("user32.dll", CharSet=CharSet.Unicode)]
  public static extern IntPtr FindWindow(string c, string t);
  [DllImport("user32.dll")]
  public static extern bool GetWindowRect(IntPtr h, out RECT r);
  [StructLayout(LayoutKind.Sequential)]
  public struct RECT { public int L, T, R, B; }
  public static string Get(string title) {
    IntPtr hwnd = FindWindow(null, title);
    if (hwnd == IntPtr.Zero) return "NOT_FOUND";
    RECT r;
    if (!GetWindowRect(hwnd, out r)) return "NOT_FOUND";
    int w = r.R - r.L; int h = r.B - r.T;
    if (w <= 0 || h <= 0) return "INVALID_SIZE";
    return r.L + "," + r.T + "," + w + "," + h;
  }
}
'@
[WinRect]::Get('${psTitle}')
`
}

/**
 * Convert the JSON printed by the OCR script into an OcrResult.
 *
 * Missing fields fall back to '' / 0 / `lang`. Empty or unparsable output
 * yields an empty result for `lang`.
 */
function parseOcrOutput(raw: string, lang: string): OcrResult {
  if (!raw) return emptyResult(lang)
  try {
    const parsed = JSON.parse(raw)
    const rawLines: unknown[] = Array.isArray(parsed.lines) ? parsed.lines : []
    const lines = rawLines.map((entry): OcrLine => {
      const item = (entry ?? {}) as {
        text?: string
        bounds?: { x?: number; y?: number; w?: number; h?: number }
      }
      return {
        text: item.text ?? '',
        bounds: {
          x: item.bounds?.x ?? 0,
          y: item.bounds?.y ?? 0,
          w: item.bounds?.w ?? 0,
          h: item.bounds?.h ?? 0,
        },
      }
    })
    return {
      text: parsed.text ?? '',
      lines,
      language: parsed.language ?? lang,
    }
  } catch {
    return emptyResult(lang)
  }
}

/**
 * Perform OCR on a screen region.
 * Screenshots the specified rectangle, then runs WinRT OcrEngine.
 *
 * Never rejects: any failure resolves to an empty result.
 *
 * @param x - Left coordinate
 * @param y - Top coordinate
 * @param w - Width in pixels
 * @param h - Height in pixels
 * @param lang - BCP-47 language tag (default 'en-US'). Confirmed: 'en-US', 'zh-Hans-CN'
 * @returns Recognised text, per-line bounds and the language used
 */
export async function ocrRegion(
  x: number,
  y: number,
  w: number,
  h: number,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'
  if (w <= 0 || h <= 0) return emptyResult(language)

  try {
    const script = buildOcrRegionScript(x, y, w, h, language)
    return parseOcrOutput(runPs(script), language)
  } catch {
    return emptyResult(language)
  }
}

/**
 * Perform OCR on a specific window by its title.
 *
 * Resolves the window rectangle via FindWindow/GetWindowRect and delegates to
 * ocrRegion, which screenshots that rectangle of the screen.
 *
 * @param windowTitle - Exact window title to find via FindWindow
 * @param lang - BCP-47 language tag (default 'en-US')
 * @returns OCR result, or an empty result when the window cannot be resolved
 */
export async function ocrWindow(
  windowTitle: string,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'

  try {
    const rectText = runPs(buildGetWindowRectScript(windowTitle)).trim()
    if (!rectText || rectText === 'NOT_FOUND' || rectText === 'INVALID_SIZE') {
      return emptyResult(language)
    }

    const fields = rectText.split(',')
    if (fields.length !== 4) return emptyResult(language)

    const [x, y, w, h] = fields.map(Number)
    // Reject NaN in any field, not only in the size.
    if (![x, y, w, h].every(Number.isFinite)) return emptyResult(language)
    if (w <= 0 || h <= 0) return emptyResult(language)

    return ocrRegion(x, y, w, h, lang)
  } catch {
    return emptyResult(language)
  }
}
/** One node of a UI Automation element tree, as serialised by the PowerShell scripts in this module. */
export interface UIElement {
  name: string
  controlType: string // Button, Edit, Text, List, Window, etc.
  automationId: string
  boundingRect: { x: number; y: number; w: number; h: number }
  isEnabled: boolean
  /** Present only when the element exposes ValuePattern. */
  value?: string
  children?: UIElement[]
}

// ---------------------------------------------------------------------------
// Helper
// ---------------------------------------------------------------------------

/** Assemblies every UI Automation script loads before touching the API. */
const UIA_ASSEMBLIES = `
Add-Type -AssemblyName UIAutomationClient
Add-Type -AssemblyName UIAutomationTypes
Add-Type -AssemblyName WindowsBase
`

/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 *
 * stderr is captured but not returned and the exit code is not inspected, so
 * callers detect failure from the stdout content alone.
 */
function ps(script: string): string {
  const proc = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const stdoutText = new TextDecoder().decode(proc.stdout)
  return stdoutText.trim()
}

/**
 * Parse `raw` as JSON, returning `fallback` for empty or malformed input.
 *
 * @param raw - Text to parse
 * @param fallback - Value returned when `raw` is empty or not valid JSON
 */
function parseJsonSafe<T>(raw: string, fallback: T): T {
  if (!raw) return fallback
  try {
    return JSON.parse(raw) as T
  } catch {
    return fallback
  }
}

// PowerShell snippet that finds a window by exact or partial title match.
// Assumes $title is already set in the calling script. Leaves the match
// (or $null) in $window: an exact Name match among the root element's
// children is tried first, then the first child whose Name contains $title.
const PS_FIND_WINDOW = `
$root = [System.Windows.Automation.AutomationElement]::RootElement
$window = $root.FindFirst(
  [System.Windows.Automation.TreeScope]::Children,
  [System.Windows.Automation.PropertyCondition]::new(
    [System.Windows.Automation.AutomationElement]::NameProperty, $title))
if ($window -eq $null) {
  $all = $root.FindAll(
    [System.Windows.Automation.TreeScope]::Children,
    [System.Windows.Automation.Condition]::TrueCondition)
  foreach ($el in $all) {
    if ($el.Current.Name -and $el.Current.Name.Contains($title)) {
      $window = $el
      break
    }
  }
}
`
/**
 * Get the UI element tree of a window, up to `depth` levels deep (default 3).
 *
 * The window is located with PS_FIND_WINDOW. When no window matches, or the
 * window has no children within the requested depth, the result is `[]`.
 *
 * @param windowTitle - Title used by PS_FIND_WINDOW to locate the window
 * @param depth - Number of tree levels to read; truncated to a whole number
 *   and clamped to 0..40 (a non-finite value falls back to 3)
 * @returns The window's direct children, each with nested `children`
 */
export function getUITree(windowTitle: string, depth: number = 3): UIElement[] {
  const psTitle = windowTitle.replace(/'/g, "''")

  // `depth` is spliced into the script as a bare token, so force it to a
  // bounded whole number first.
  const wholeDepth = Number.isFinite(depth) ? Math.trunc(depth) : 3
  const maxDepth = Math.min(Math.max(wholeDepth, 0), 40)
  // Each tree level adds an array plus an object to the JSON nesting, so the
  // serialisation depth has to grow with the tree depth to avoid truncation.
  const jsonDepth = Math.min(100, Math.max(20, maxDepth * 2 + 4))

  const script = `
${UIA_ASSEMBLIES}
$title = '${psTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output '[]'
  exit
}

function Get-UIChildren($parent, $currentDepth, $maxDepth) {
  if ($currentDepth -ge $maxDepth) { return @() }
  $children = $parent.FindAll(
    [System.Windows.Automation.TreeScope]::Children,
    [System.Windows.Automation.Condition]::TrueCondition)
  $result = @()
  foreach ($el in $children) {
    $rect = $el.Current.BoundingRectangle
    $obj = @{
      name = [string]$el.Current.Name
      controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
      automationId = [string]$el.Current.AutomationId
      boundingRect = @{
        x = [int]$rect.X
        y = [int]$rect.Y
        w = [int]$rect.Width
        h = [int]$rect.Height
      }
      isEnabled = $el.Current.IsEnabled
    }
    try {
      $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
      if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
    } catch {}
    # A function result with one element is unrolled to the bare element;
    # @(...) turns it back into an array so 'children' is always a list.
    $sub = @(Get-UIChildren $el ($currentDepth + 1) $maxDepth)
    if ($sub.Count -gt 0) { $obj['children'] = $sub }
    $result += $obj
  }
  return $result
}

$tree = @(Get-UIChildren $window 0 ${maxDepth})
if ($tree.Count -eq 0) {
  Write-Output '[]'
} else {
  $tree | ConvertTo-Json -Depth ${jsonDepth} -Compress
}
`
  const parsed: unknown = parseJsonSafe(ps(script), [] as UIElement[])
  if (Array.isArray(parsed)) return parsed as UIElement[]
  // Piping a single element to ConvertTo-Json emits an object, not an array.
  return parsed ? [parsed as UIElement] : []
}
/**
 * Find a single element inside a window matching the given query fields.
 *
 * All supplied fields must match (they are combined with AndCondition). The
 * search covers the window's descendants and returns the first hit.
 *
 * @param windowTitle - Title used by PS_FIND_WINDOW to locate the window
 * @param query - Any combination of name, controlType and automationId
 * @returns The matching element, or `null` when the query is empty, when
 *   `controlType` is not a plain identifier, or when nothing matches
 */
export function findElement(
  windowTitle: string,
  query: { name?: string; controlType?: string; automationId?: string },
): UIElement | null {
  const psQuote = (text: string): string => text.replace(/'/g, "''")
  const propertyCondition = (property: string, valueExpr: string): string =>
    `[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::${property}, ${valueExpr})`

  const conditions: string[] = []
  if (query.name) {
    conditions.push(propertyCondition('NameProperty', `'${psQuote(query.name)}'`))
  }
  if (query.controlType) {
    // The control type becomes a bare static-member reference, where quote
    // escaping offers no protection; accept identifiers only.
    if (!/^[A-Za-z]+$/.test(query.controlType)) return null
    conditions.push(
      propertyCondition(
        'ControlTypeProperty',
        `[System.Windows.Automation.ControlType]::${query.controlType}`,
      ),
    )
  }
  if (query.automationId) {
    conditions.push(
      propertyCondition('AutomationIdProperty', `'${psQuote(query.automationId)}'`),
    )
  }

  if (conditions.length === 0) return null

  const conditionExpr =
    conditions.length === 1
      ? conditions[0]
      : `[System.Windows.Automation.AndCondition]::new(@(${conditions.join(', ')}))`

  const script = `
${UIA_ASSEMBLIES}
$title = '${psQuote(windowTitle)}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output 'null'
  exit
}
$cond = ${conditionExpr}
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
  Write-Output 'null'
  exit
}
$rect = $el.Current.BoundingRectangle
$obj = @{
  name = [string]$el.Current.Name
  controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
  automationId = [string]$el.Current.AutomationId
  boundingRect = @{
    x = [int]$rect.X
    y = [int]$rect.Y
    w = [int]$rect.Width
    h = [int]$rect.Height
  }
  isEnabled = $el.Current.IsEnabled
}
try {
  $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
  if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
} catch {}
$obj | ConvertTo-Json -Compress
`
  return parseJsonSafe(ps(script), null as UIElement | null)
}

/**
 * Click an element by its automationId using InvokePattern.
 *
 * @param windowTitle - Title used by PS_FIND_WINDOW to locate the window
 * @param automationId - AutomationId of the descendant element to invoke
 * @returns `true` only when the script reports a successful Invoke(); `false`
 *   when the window or element is missing or the element cannot be invoked
 */
export function clickElement(windowTitle: string, automationId: string): boolean {
  const psTitle = windowTitle.replace(/'/g, "''")
  const psId = automationId.replace(/'/g, "''")

  const script = `
${UIA_ASSEMBLIES}
$title = '${psTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output 'false'
  exit
}
$cond = [System.Windows.Automation.PropertyCondition]::new(
  [System.Windows.Automation.AutomationElement]::AutomationIdProperty, '${psId}')
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
  Write-Output 'false'
  exit
}
try {
  $ip = $el.GetCurrentPattern([System.Windows.Automation.InvokePattern]::Pattern)
  $ip.Invoke()
  Write-Output 'true'
} catch {
  Write-Output 'false'
}
`
  return ps(script) === 'true'
}
/**
 * Set the value of an element by its automationId using ValuePattern.
 *
 * @param windowTitle - Title used by PS_FIND_WINDOW to locate the window
 * @param automationId - AutomationId of the descendant element to update
 * @param value - Text passed to ValuePattern.SetValue
 * @returns `true` only when the script reports a successful SetValue();
 *   `false` when the window or element is missing or the call fails
 */
export function setValue(windowTitle: string, automationId: string, value: string): boolean {
  // All three strings live inside single-quoted PowerShell literals.
  const psQuote = (text: string): string => text.replace(/'/g, "''")

  const script = `
${UIA_ASSEMBLIES}
$title = '${psQuote(windowTitle)}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output 'false'
  exit
}
$cond = [System.Windows.Automation.PropertyCondition]::new(
  [System.Windows.Automation.AutomationElement]::AutomationIdProperty, '${psQuote(automationId)}')
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
  Write-Output 'false'
  exit
}
try {
  $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
  $vp.SetValue('${psQuote(value)}')
  Write-Output 'true'
} catch {
  Write-Output 'false'
}
`
  return ps(script) === 'true'
}

/**
 * Get the UI element at a specific screen coordinate.
 *
 * @param x - Horizontal screen coordinate
 * @param y - Vertical screen coordinate
 * @returns The element reported by AutomationElement.FromPoint, or `null`
 *   when the coordinates are not finite numbers or the lookup fails
 */
export function elementAtPoint(x: number, y: number): UIElement | null {
  // The coordinates are spliced into the script as bare tokens; anything that
  // is not a finite number would corrupt the script.
  const isUsable = (n: number): boolean => typeof n === 'number' && Number.isFinite(n)
  if (!isUsable(x) || !isUsable(y)) return null

  const script = `
${UIA_ASSEMBLIES}
try {
  $point = [System.Windows.Point]::new(${x}, ${y})
  $el = [System.Windows.Automation.AutomationElement]::FromPoint($point)
  if ($el -eq $null) {
    Write-Output 'null'
    exit
  }
  $rect = $el.Current.BoundingRectangle
  $obj = @{
    name = [string]$el.Current.Name
    controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
    automationId = [string]$el.Current.AutomationId
    boundingRect = @{
      x = [int]$rect.X
      y = [int]$rect.Y
      w = [int]$rect.Width
      h = [int]$rect.Height
    }
    isEnabled = $el.Current.IsEnabled
  }
  try {
    $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
    if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
  } catch {}
  $obj | ConvertTo-Json -Compress
} catch {
  Write-Output 'null'
}
`
  return parseJsonSafe(ps(script), null as UIElement | null)
}
/**
 * Window-level screenshot capture using Win32 PrintWindow API.
 * Captures a window's own content, so it also works when the window is
 * occluded by others.
 * NOTE(review): minimized windows are untested here; the bitmap is sized from
 * GetWindowRect, so confirm what that reports for a minimized window.
 */

/** PNG screenshot plus its pixel dimensions. */
interface CaptureResult {
  base64: string
  width: number
  height: number
}

// C# members shared by both capture classes. Render() prints
// "width,height,base64png" on success, or one of the sentinels NOT_FOUND,
// INVALID_SIZE, CAPTURE_FAILED. The PrintWindow flag 2 is
// PW_RENDERFULLCONTENT. Bitmap, Graphics and MemoryStream are wrapped in
// using blocks so they are released on every path.
const CAPTURE_RENDER_CS = `
  [DllImport("user32.dll")]
  public static extern bool GetWindowRect(IntPtr h, out RECT r);
  [DllImport("user32.dll")]
  public static extern bool PrintWindow(IntPtr h, IntPtr hdc, uint f);
  [StructLayout(LayoutKind.Sequential)]
  public struct RECT { public int L, T, R, B; }

  static string Render(IntPtr hwnd) {
    RECT r;
    if (!GetWindowRect(hwnd, out r)) return "NOT_FOUND";
    int w = r.R - r.L; int h = r.B - r.T;
    if (w <= 0 || h <= 0) return "INVALID_SIZE";
    using (Bitmap bmp = new Bitmap(w, h))
    using (var ms = new System.IO.MemoryStream()) {
      bool ok;
      using (Graphics g = Graphics.FromImage(bmp)) {
        IntPtr hdc = g.GetHdc();
        try { ok = PrintWindow(hwnd, hdc, 2); }
        finally { g.ReleaseHdc(hdc); }
      }
      if (!ok) return "CAPTURE_FAILED";
      bmp.Save(ms, ImageFormat.Png);
      return w + "," + h + "," + Convert.ToBase64String(ms.ToArray());
    }
  }
`

/** Defines [WinCap]::Capture(string title): capture the window found by exact title. */
const CAPTURE_BY_TITLE_PS = `
Add-Type -AssemblyName System.Drawing
Add-Type -ReferencedAssemblies System.Drawing @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
public class WinCap {
  [DllImport("user32.dll", CharSet=CharSet.Unicode)]
  public static extern IntPtr FindWindow(string c, string t);
${CAPTURE_RENDER_CS}
  public static string Capture(string title) {
    IntPtr hwnd = FindWindow(null, title);
    if (hwnd == IntPtr.Zero) return "NOT_FOUND";
    return Render(hwnd);
  }
}
'@
`

/** Defines [WinCapH]::Capture(IntPtr hwnd): capture the window behind a handle. */
const CAPTURE_BY_HWND_PS = `
Add-Type -AssemblyName System.Drawing
Add-Type -ReferencedAssemblies System.Drawing @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
public class WinCapH {
  [DllImport("user32.dll")]
  public static extern bool IsWindow(IntPtr hWnd);
${CAPTURE_RENDER_CS}
  public static string Capture(IntPtr hwnd) {
    if (!IsWindow(hwnd)) return "NOT_FOUND";
    return Render(hwnd);
  }
}
'@
`

/**
 * Parse the "width,height,base64" line printed by the capture scripts.
 *
 * @param raw - Script stdout
 * @returns The decoded fields, or `null` for empty output, a sentinel
 *   (NOT_FOUND / INVALID_SIZE / CAPTURE_FAILED), or a malformed line
 */
function parseCaptureOutput(raw: string): CaptureResult | null {
  const line = raw.trim()
  const sentinels = ['NOT_FOUND', 'INVALID_SIZE', 'CAPTURE_FAILED']
  if (!line || sentinels.includes(line)) return null

  // Only the first two commas are separators; the rest of the line is payload.
  const widthEnd = line.indexOf(',')
  const heightEnd = line.indexOf(',', widthEnd + 1)
  if (widthEnd === -1 || heightEnd === -1) return null

  const width = Number(line.slice(0, widthEnd))
  const height = Number(line.slice(widthEnd + 1, heightEnd))
  const base64 = line.slice(heightEnd + 1)

  if (!width || !height || !base64) return null
  return { base64, width, height }
}

/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 * stderr is captured but not returned; the exit code is not inspected.
 */
function runPs(script: string): string {
  const proc = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  return new TextDecoder().decode(proc.stdout).trim()
}
+ */ +export function captureWindow(title: string): CaptureResult | null { + const escaped = title.replace(/'/g, "''") + const script = `${CAPTURE_BY_TITLE_PS}\n[WinCap]::Capture('${escaped}')` + const raw = runPs(script) + return parseCaptureOutput(raw) +} + +/** + * Capture a window screenshot by its HWND handle. + */ +export function captureWindowByHwnd(hwnd: number): CaptureResult | null { + const script = `${CAPTURE_BY_HWND_PS}\n[WinCapH]::Capture([IntPtr]::new(${hwnd}))` + const raw = runPs(script) + return parseCaptureOutput(raw) +} diff --git a/src/utils/computerUse/win32/windowEnum.ts b/src/utils/computerUse/win32/windowEnum.ts new file mode 100644 index 000000000..03bdbbebb --- /dev/null +++ b/src/utils/computerUse/win32/windowEnum.ts @@ -0,0 +1,86 @@ +/** + * Window enumeration using Win32 EnumWindows API. + * Returns visible windows with their HWND, PID, and title. + */ + +export interface WindowInfo { + hwnd: number + pid: number + title: string +} + +const ENUM_WINDOWS_PS = ` +Add-Type @' +using System; +using System.Collections.Generic; +using System.Runtime.InteropServices; +using System.Text; +public class WinEnum { + public delegate bool EnumWindowsProc(IntPtr hWnd, IntPtr lParam); + + [DllImport("user32.dll")] + public static extern bool EnumWindows(EnumWindowsProc lpEnumFunc, IntPtr lParam); + [DllImport("user32.dll")] + public static extern bool IsWindowVisible(IntPtr hWnd); + [DllImport("user32.dll", CharSet=CharSet.Unicode)] + public static extern int GetWindowText(IntPtr hWnd, StringBuilder lpString, int nMaxCount); + [DllImport("user32.dll")] + public static extern int GetWindowTextLength(IntPtr hWnd); + [DllImport("user32.dll")] + public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint processId); + + public static List results = new List(); + + public static void Run() { + results.Clear(); + EnumWindows(delegate(IntPtr hWnd, IntPtr lParam) { + if (!IsWindowVisible(hWnd)) return true; + int len = GetWindowTextLength(hWnd); 
+ if (len == 0) return true; + StringBuilder sb = new StringBuilder(len + 1); + GetWindowText(hWnd, sb, sb.Capacity); + string title = sb.ToString(); + if (string.IsNullOrWhiteSpace(title)) return true; + uint pid = 0; + GetWindowThreadProcessId(hWnd, out pid); + results.Add(hWnd.ToInt64() + "|" + pid + "|" + title); + return true; + }, IntPtr.Zero); + } +} +'@ +[WinEnum]::Run() +[WinEnum]::results | ForEach-Object { $_ } +` + +/** + * List all visible windows with non-empty titles. + * Returns HWND, PID, and window title for each. + */ +export function listWindows(): WindowInfo[] { + const result = Bun.spawnSync({ + cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', ENUM_WINDOWS_PS], + stdout: 'pipe', + stderr: 'pipe', + }) + const raw = new TextDecoder().decode(result.stdout).trim() + if (!raw) return [] + + return raw + .split('\n') + .filter(Boolean) + .map(line => { + const trimmed = line.trim() + const firstPipe = trimmed.indexOf('|') + const secondPipe = trimmed.indexOf('|', firstPipe + 1) + if (firstPipe === -1 || secondPipe === -1) return null + + const hwnd = Number(trimmed.slice(0, firstPipe)) + const pid = Number(trimmed.slice(firstPipe + 1, secondPipe)) + const title = trimmed.slice(secondPipe + 1) + + if (isNaN(hwnd) || isNaN(pid) || !title) return null + return { hwnd, pid, title } + }) + .filter((item): item is WindowInfo => item !== null) +}