feat: enable Computer Use with macOS + Windows + Linux support

Phase 1: Replace @ant/computer-use-mcp stub (12 files, 6517 lines).

Phase 2: Remove 8 macOS-only guards in src/:
- main.tsx: remove getPlatform()==='macos' check
- swiftLoader.ts: remove darwin-only throw
- executor.ts: extend platform guard, clipboard dispatch, paste key
- drainRunLoop.ts: skip CFRunLoop pump on non-darwin
- escHotkey.ts: non-darwin returns false (Ctrl+C fallback)
- hostAdapter.ts: non-darwin permissions granted
- common.ts: dynamic platform + screenshotFiltering
- gates.ts: enabled:true, subscription check removed

Phase 3: Add Linux backends (xdotool/scrot/xrandr/wmctrl):
- computer-use-input/backends/linux.ts (173 lines)
- computer-use-swift/backends/linux.ts (278 lines)

Verified on Windows x64: mouse, screenshot, displays, foreground app.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
unraid
2026-04-03 22:33:00 +08:00
parent 465e9f01c6
commit e3264a1691
34 changed files with 8291 additions and 750 deletions

View File

@@ -0,0 +1,258 @@
/**
* macOS backend for computer-use-swift
*
* Uses AppleScript/JXA/screencapture for display info, app management,
* and screenshots.
*/
import { readFileSync, unlinkSync } from 'fs'
import { tmpdir } from 'os'
import { join } from 'path'
import type {
AppInfo, AppsAPI, DisplayAPI, DisplayGeometry, InstalledApp,
PrepareDisplayResult, RunningApp, ScreenshotAPI, ScreenshotResult,
SwiftBackend, WindowDisplayInfo,
} from '../types.js'
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Runs a JXA (JavaScript for Automation) snippet through `osascript` and
 * blocks until the process has finished.
 *
 * @param script JXA source handed to `osascript -l JavaScript -e`.
 * @returns The captured stdout decoded as text, with surrounding whitespace
 *   trimmed. Neither stderr nor the exit code is inspected, so a failing
 *   script simply yields whatever (possibly empty) stdout it produced.
 */
function jxaSync(script: string): string {
  const argv = ['osascript', '-l', 'JavaScript', '-e', script]
  const { stdout } = Bun.spawnSync({ cmd: argv, stdout: 'pipe', stderr: 'pipe' })
  const decoded = new TextDecoder().decode(stdout)
  return decoded.trim()
}
/**
 * Runs an AppleScript snippet through `osascript -e` and blocks until the
 * process has finished.
 *
 * @param script AppleScript source.
 * @returns Trimmed stdout text. stderr and the exit code are ignored.
 */
function osascriptSync(script: string): string {
  const argv = ['osascript', '-e', script]
  const { stdout } = Bun.spawnSync({ cmd: argv, stdout: 'pipe', stderr: 'pipe' })
  const decoded = new TextDecoder().decode(stdout)
  return decoded.trim()
}
/**
 * Asynchronous counterpart of the AppleScript runner: spawns `osascript -e`,
 * collects its stdout, then waits for the process to exit.
 *
 * @param script AppleScript source.
 * @returns Trimmed stdout text. stderr and the exit code are ignored.
 */
async function osascript(script: string): Promise<string> {
  const argv = ['osascript', '-e', script]
  const child = Bun.spawn(argv, { stdout: 'pipe', stderr: 'pipe' })
  // Drain stdout before waiting for exit, in the same order as the caller expects.
  const output = await new Response(child.stdout).text()
  await child.exited
  return output.trim()
}
/**
 * Asynchronous JXA runner: spawns `osascript -l JavaScript -e`, collects its
 * stdout, then waits for the process to exit.
 *
 * @param script JXA source.
 * @returns Trimmed stdout text. stderr and the exit code are ignored.
 */
async function jxa(script: string): Promise<string> {
  const argv = ['osascript', '-l', 'JavaScript', '-e', script]
  const child = Bun.spawn(argv, { stdout: 'pipe', stderr: 'pipe' })
  const output = await new Response(child.stdout).text()
  await child.exited
  return output.trim()
}
// ---------------------------------------------------------------------------
// DisplayAPI
// ---------------------------------------------------------------------------
/**
 * Display enumeration for macOS, implemented with JXA scripts executed
 * through `jxaSync`.
 */
export const display: DisplayAPI = {
  /**
   * Returns the geometry of one display.
   *
   * @param displayId Identifier to look for; when omitted or unknown the
   *   first listed display is used instead.
   * @returns The matching entry, else the first entry of `listAll()`, else a
   *   hard-coded 1920x1080 @2x record with `displayId` 1.
   */
  getSize(displayId?: number): DisplayGeometry {
    const known = this.listAll()
    const requested =
      displayId === undefined ? undefined : known.find(entry => entry.displayId === displayId)
    if (requested) return requested
    return known[0] ?? { width: 1920, height: 1080, scaleFactor: 2, displayId: 1 }
  },

  /**
   * Lists displays by asking CoreGraphics first and AppKit's NSScreen second.
   * Any failure of a query (including unparsable output) moves on to the next
   * strategy; when both fail a single default display is reported.
   */
  listAll(): DisplayGeometry[] {
    // Both scripts print a JSON array; coerce every field to a number.
    const normalize = (json: string): DisplayGeometry[] =>
      (JSON.parse(json) as DisplayGeometry[]).map(entry => ({
        width: Number(entry.width),
        height: Number(entry.height),
        scaleFactor: Number(entry.scaleFactor),
        displayId: Number(entry.displayId),
      }))

    try {
      return normalize(jxaSync(`
ObjC.import("CoreGraphics");
var displays = $.CGDisplayCopyAllDisplayModes ? [] : [];
var active = $.CGGetActiveDisplayList(10, null, Ref());
var countRef = Ref();
$.CGGetActiveDisplayList(0, null, countRef);
var count = countRef[0];
var idBuf = Ref();
$.CGGetActiveDisplayList(count, idBuf, countRef);
var result = [];
for (var i = 0; i < count; i++) {
var did = idBuf[i];
var w = $.CGDisplayPixelsWide(did);
var h = $.CGDisplayPixelsHigh(did);
var mode = $.CGDisplayCopyDisplayMode(did);
var pw = $.CGDisplayModeGetPixelWidth(mode);
var sf = pw > 0 && w > 0 ? pw / w : 2;
result.push({width: w, height: h, scaleFactor: sf, displayId: did});
}
JSON.stringify(result);
`))
    } catch {
      // CoreGraphics query failed; retry through NSScreen.
      try {
        return normalize(jxaSync(`
ObjC.import("AppKit");
var screens = $.NSScreen.screens;
var result = [];
for (var i = 0; i < screens.count; i++) {
var s = screens.objectAtIndex(i);
var frame = s.frame;
var desc = s.deviceDescription;
var screenNumber = desc.objectForKey($("NSScreenNumber")).intValue;
var backingFactor = s.backingScaleFactor;
result.push({
width: Math.round(frame.size.width),
height: Math.round(frame.size.height),
scaleFactor: backingFactor,
displayId: screenNumber
});
}
JSON.stringify(result);
`))
      } catch {
        return [{ width: 1920, height: 1080, scaleFactor: 2, displayId: 1 }]
      }
    }
  },
}
// ---------------------------------------------------------------------------
// AppsAPI
// ---------------------------------------------------------------------------
/**
 * Escapes a value for use inside a double-quoted AppleScript string literal.
 * Backslashes are doubled first so the backslash placed in front of each
 * quote is not escaped a second time.
 */
function escapeAppleScriptString(value: string): string {
  return value.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
}

/**
 * Application management for macOS, implemented with AppleScript and JXA.
 *
 * Several members are placeholders: `prepareDisplay`, `previewHideSet`,
 * `findWindowDisplays` and `iconDataUrl` return fixed values and perform no
 * system calls.
 */
export const apps: AppsAPI = {
  /** Placeholder: reports that nothing was activated or hidden. */
  async prepareDisplay(_allowlistBundleIds, _surrogateHost, _displayId) {
    return { activated: '', hidden: [] }
  },

  /** Placeholder: reports that no application would be hidden. */
  async previewHideSet(_bundleIds, _displayId) {
    return []
  },

  /** Placeholder: attributes every requested bundle id to display 1. */
  async findWindowDisplays(bundleIds) {
    return bundleIds.map(bundleId => ({ bundleId, displayIds: [1] }))
  },

  /**
   * Returns the frontmost application.
   *
   * NOTE(review): the coordinates are only used to build a CGPoint that the
   * script never consults, so the result does not depend on the point.
   *
   * @returns The frontmost app, or `null` when the script fails or prints
   *   something that is not JSON.
   */
  async appUnderPoint(_x, _y) {
    try {
      const result = await jxa(`
ObjC.import("CoreGraphics");
ObjC.import("AppKit");
var pt = $.CGPointMake(${_x}, ${_y});
var app = $.NSWorkspace.sharedWorkspace.frontmostApplication;
JSON.stringify({bundleId: app.bundleIdentifier.js, displayName: app.localizedName.js});
`)
      return JSON.parse(result)
    } catch {
      return null
    }
  },

  /**
   * Lists the `.app` bundles found directly in the startup disk's
   * Applications folder.
   *
   * NOTE(review): `bundleId` is synthesised from the display name
   * (`com.app.<lower-cased-name>`); it is not the bundle's real identifier.
   */
  async listInstalled() {
    try {
      const result = await osascript(`
tell application "System Events"
set appList to ""
repeat with appFile in (every file of folder "Applications" of startup disk whose name ends with ".app")
set appPath to POSIX path of (appFile as alias)
set appName to name of appFile
set appList to appList & appPath & "|" & appName & "\\n"
end repeat
return appList
end tell
`)
      return result.split('\n').filter(Boolean).map(line => {
        const [path, name] = line.split('|', 2)
        const displayName = (name ?? '').replace(/\.app$/, '')
        return {
          bundleId: `com.app.${displayName.toLowerCase().replace(/\s+/g, '-')}`,
          displayName,
          path: path ?? '',
        }
      })
    } catch {
      return []
    }
  },

  /** Placeholder: icons are not provided by this backend. */
  iconDataUrl(_path) {
    return null
  },

  /**
   * Lists non-background application processes known to System Events.
   * Returns an empty list when the query fails or does not print a JSON array.
   */
  listRunning() {
    try {
      const raw = jxaSync(`
var apps = Application("System Events").applicationProcesses.whose({backgroundOnly: false});
var result = [];
for (var i = 0; i < apps.length; i++) {
try {
var a = apps[i];
result.push({bundleId: a.bundleIdentifier(), displayName: a.name()});
} catch(e) {}
}
JSON.stringify(result);
`)
      const parsed: unknown = JSON.parse(raw)
      return Array.isArray(parsed) ? parsed : []
    } catch {
      return []
    }
  },

  /**
   * Activates the application with the given bundle identifier.
   * The identifier is escaped before it is embedded in the AppleScript source
   * so that quotes or backslashes cannot terminate the string literal.
   */
  async open(bundleId) {
    await osascript(`tell application id "${escapeAppleScriptString(bundleId)}" to activate`)
  },

  /**
   * Makes the processes of the given bundle identifiers visible again, one
   * `osascript` invocation per identifier. Identifiers are escaped before
   * being embedded in the script.
   */
  async unhide(bundleIds) {
    for (const bundleId of bundleIds) {
      await osascript(`
tell application "System Events"
set visible of application process (name of application process whose bundle identifier is "${escapeAppleScriptString(bundleId)}") to true
end tell
`)
    }
  },
}
// ---------------------------------------------------------------------------
// ScreenshotAPI
// ---------------------------------------------------------------------------
/**
 * Runs `screencapture` into a private temporary PNG, returns the image as
 * base64 together with the pixel dimensions read from the PNG header, and
 * removes the temporary file again.
 *
 * @param args Extra `screencapture` arguments placed before the output path.
 * @returns Base64 PNG data plus the width/height stored in the IHDR chunk.
 * @throws If the output file cannot be read (e.g. `screencapture` produced
 *   nothing) or if it is not a PNG with a complete header.
 */
async function captureScreenToBase64(args: string[]): Promise<{ base64: string; width: number; height: number }> {
  // A timestamp alone collides when two captures start in the same
  // millisecond; add the pid and a random component so each call owns its file.
  const uniqueSuffix = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2, 10)}`
  const tmpFile = join(tmpdir(), `cu-screenshot-${uniqueSuffix}.png`)
  const proc = Bun.spawn(['screencapture', ...args, tmpFile], {
    stdout: 'pipe', stderr: 'pipe',
  })
  const exitCode = await proc.exited
  try {
    const buf = readFileSync(tmpFile)
    // 8-byte signature + IHDR length/type (8 bytes) + width and height (8 bytes).
    const looksLikePng =
      buf.length >= 24 &&
      buf.readUInt32BE(0) === 0x89504e47 &&
      buf.readUInt32BE(4) === 0x0d0a1a0a
    if (!looksLikePng) {
      throw new Error(`screencapture did not produce a valid PNG (exit code ${exitCode})`)
    }
    return {
      base64: buf.toString('base64'),
      width: buf.readUInt32BE(16),
      height: buf.readUInt32BE(20),
    }
  } finally {
    // Best-effort cleanup: the file may not exist when the capture failed.
    try { unlinkSync(tmpFile) } catch { /* nothing to clean up */ }
  }
}
/**
 * Translates a display identifier into the value expected by
 * `screencapture -D`, which is a 1-based position in the display list
 * (1 = main display), not a CoreGraphics display id.
 *
 * When the identifier is not present in `display.listAll()` the raw value is
 * passed through unchanged, which preserves the previous behaviour.
 * NOTE(review): assumes `display.listAll()` reports displays in the same
 * order `screencapture` numbers them (main display first) — confirm on a
 * setup with three or more displays.
 */
function screencaptureDisplayArg(displayId: number): string {
  const position = display.listAll().findIndex(entry => entry.displayId === displayId)
  return String(position >= 0 ? position + 1 : displayId)
}

/**
 * Screenshot capture for macOS, implemented with the `screencapture` CLI.
 *
 * The allow-list, quality and target-size parameters are accepted for
 * interface compatibility but are not applied by this backend.
 */
export const screenshot: ScreenshotAPI = {
  /**
   * Captures a whole display silently (`-x`).
   * @param displayId Display to capture; omitted means `screencapture`'s default.
   */
  async captureExcluding(_allowedBundleIds, _quality, _targetW, _targetH, displayId) {
    const args = ['-x']
    if (displayId !== undefined) args.push('-D', screencaptureDisplayArg(displayId))
    return captureScreenToBase64(args)
  },

  /**
   * Captures the rectangle `x,y,w,h` silently (`-x -R`).
   * @param displayId Display to capture from; omitted means `screencapture`'s default.
   */
  async captureRegion(_allowedBundleIds, x, y, w, h, _outW, _outH, _quality, displayId) {
    const args = ['-x', '-R', `${x},${y},${w},${h}`]
    if (displayId !== undefined) args.push('-D', screencaptureDisplayArg(displayId))
    return captureScreenToBase64(args)
  },
}

View File

@@ -0,0 +1,278 @@
/**
* Linux backend for computer-use-swift
*
* Uses xrandr for display info, scrot for screenshots,
* wmctrl/xdotool for window management, and xdg-open for launching apps.
*
* Requires: xrandr, scrot, xdotool, wmctrl (optional)
*/
import type {
AppInfo, AppsAPI, DisplayAPI, DisplayGeometry, InstalledApp,
PrepareDisplayResult, RunningApp, ScreenshotAPI, ScreenshotResult,
SwiftBackend, WindowDisplayInfo,
} from '../types.js'
// ---------------------------------------------------------------------------
// Shell helpers
// ---------------------------------------------------------------------------
/**
 * Executes a command synchronously and returns what it wrote to stdout.
 *
 * @param cmd Program and arguments (no shell is involved).
 * @returns Trimmed stdout text. stderr and the exit code are ignored.
 */
function run(cmd: string[]): string {
  const { stdout } = Bun.spawnSync({ cmd, stdout: 'pipe', stderr: 'pipe' })
  const decoded = new TextDecoder().decode(stdout)
  return decoded.trim()
}
/**
 * Executes a command asynchronously: collects stdout, waits for the process
 * to exit, and returns the trimmed output.
 *
 * @param cmd Program and arguments (no shell is involved).
 * @returns Trimmed stdout text. stderr and the exit code are ignored.
 */
async function runAsync(cmd: string[]): Promise<string> {
  const child = Bun.spawn(cmd, { stdout: 'pipe', stderr: 'pipe' })
  const output = await new Response(child.stdout).text()
  await child.exited
  return output.trim()
}
/**
 * Reports whether `which` can resolve the given program name.
 *
 * @param name Program to look up.
 * @returns `true` exactly when `which <name>` exits with status 0.
 */
function commandExists(name: string): boolean {
  const { exitCode } = Bun.spawnSync({ cmd: ['which', name], stdout: 'pipe', stderr: 'pipe' })
  return exitCode === 0
}
// ---------------------------------------------------------------------------
// DisplayAPI
// ---------------------------------------------------------------------------
/**
 * Display enumeration for Linux, implemented by parsing `xrandr --query`.
 */
export const display: DisplayAPI = {
  /**
   * Returns the geometry of one display.
   *
   * @param displayId Index assigned by `listAll()`; when omitted or unknown
   *   the first listed display is used.
   * @returns The matching entry, else the first entry, else a hard-coded
   *   1920x1080 record with `displayId` 0.
   */
  getSize(displayId?: number): DisplayGeometry {
    const known = this.listAll()
    const requested =
      displayId === undefined ? undefined : known.find(entry => entry.displayId === displayId)
    if (requested) return requested
    return known[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
  },

  /**
   * Lists connected outputs that currently have a mode, numbered from 0 in
   * the order `xrandr` prints them. `scaleFactor` is always reported as 1.
   * Falls back to a single default display when nothing matches or the
   * command cannot be run.
   */
  listAll(): DisplayGeometry[] {
    try {
      const output = run(['xrandr', '--query'])
      // Matches e.g. "HDMI-1 connected 1920x1080+0+0" or
      // "eDP-1 connected primary 2560x1440+0+0".
      const connected = /^\S+\s+connected\s+(?:primary\s+)?(\d+)x(\d+)\+\d+\+\d+/gm
      const found: DisplayGeometry[] = Array.from(output.matchAll(connected), (hit, position) => ({
        width: Number(hit[1]),
        height: Number(hit[2]),
        scaleFactor: 1,
        displayId: position,
      }))
      return found.length > 0
        ? found
        : [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }]
    } catch {
      return [{ width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }]
    }
  },
}
// ---------------------------------------------------------------------------
// AppsAPI
// ---------------------------------------------------------------------------
/**
 * Application management for Linux, implemented with xdotool, wmctrl,
 * `.desktop` files and /proc.
 *
 * There is no bundle-id concept here: `bundleId` holds an executable path, a
 * pid, or a `.desktop` file name depending on which method produced it.
 * `prepareDisplay`, `previewHideSet`, `findWindowDisplays` and `iconDataUrl`
 * are placeholders that return fixed values.
 */
export const apps: AppsAPI = {
  /** Placeholder: reports that nothing was activated or hidden. */
  async prepareDisplay(_allowlistBundleIds, _surrogateHost, _displayId): Promise<PrepareDisplayResult> {
    return { activated: '', hidden: [] }
  },

  /** Placeholder: reports that no application would be hidden. */
  async previewHideSet(_bundleIds, _displayId): Promise<AppInfo[]> {
    return []
  },

  /** Placeholder: attributes every requested id to display 0. */
  async findWindowDisplays(bundleIds): Promise<WindowDisplayInfo[]> {
    return bundleIds.map(bundleId => ({ bundleId, displayIds: [0] }))
  },

  /**
   * Identifies the process owning the window at a screen position.
   *
   * xdotool can only report the window under the pointer, so the pointer is
   * moved to the position, queried, and then moved back (`mousemove restore`)
   * within the same xdotool invocation so the user's cursor is not left at
   * the probed location.
   *
   * @returns Executable path (or pid) and process name, or `null` when any
   *   lookup step yields nothing.
   */
  async appUnderPoint(x, y): Promise<AppInfo | null> {
    try {
      const out = run([
        'xdotool',
        'mousemove', '--sync', String(Math.round(x)), String(Math.round(y)),
        'getmouselocation', '--shell',
        'mousemove', 'restore',
      ])
      const windowId = out.match(/WINDOW=(\d+)/)?.[1]
      if (!windowId) return null
      const pid = run(['xdotool', 'getwindowpid', windowId])
      if (!pid) return null
      let exePath = ''
      try { exePath = run(['readlink', '-f', `/proc/${pid}/exe`]) } catch { /* ignore */ }
      let appName = ''
      try { appName = run(['cat', `/proc/${pid}/comm`]) } catch { /* ignore */ }
      if (!exePath && !appName) return null
      return { bundleId: exePath || pid, displayName: appName || 'unknown' }
    } catch {
      return null
    }
  },

  /**
   * Lists applications described by `.desktop` files in the system-wide
   * directories and, when HOME is set, the per-user directory. Entries with
   * `NoDisplay=true` or without a `Name=` line are skipped; at most 200
   * entries are returned.
   */
  async listInstalled(): Promise<InstalledApp[]> {
    try {
      const dirs = ['/usr/share/applications', '/usr/local/share/applications']
      const home = process.env.HOME
      // Without HOME there is no per-user directory to scan.
      if (home) dirs.push(`${home}/.local/share/applications`)
      const found: InstalledApp[] = []
      for (const dir of dirs) {
        let listing: string
        try {
          listing = run(['find', dir, '-maxdepth', '1', '-name', '*.desktop'])
        } catch { continue }
        for (const filepath of listing.split('\n').filter(Boolean)) {
          try {
            const content = run(['cat', filepath])
            if (/^NoDisplay=true$/m.test(content)) continue
            const name = content.match(/^Name=(.+)$/m)?.[1] ?? ''
            if (!name) continue
            const exec = content.match(/^Exec=(.+)$/m)?.[1] ?? ''
            const fileName = filepath.split('/').pop() ?? ''
            found.push({
              // Strip only the trailing extension; ids may contain ".desktop" elsewhere.
              bundleId: fileName.replace(/\.desktop$/, ''),
              displayName: name,
              path: exec.split(/\s+/)[0] ?? '',
            })
          } catch { /* skip unreadable files */ }
        }
      }
      return found.slice(0, 200)
    } catch {
      return []
    }
  },

  /** Placeholder: icons are not provided by this backend. */
  iconDataUrl(_path): string | null {
    return null
  },

  /**
   * Lists running applications. With wmctrl available, the owners of managed
   * windows are reported (deduplicated, at most 50); otherwise the first 50
   * rows of `ps` are returned with the pid as `bundleId`.
   */
  listRunning(): RunningApp[] {
    try {
      if (commandExists('wmctrl')) {
        const raw = run(['wmctrl', '-l', '-p'])
        const owners: RunningApp[] = []
        for (const line of raw.split('\n').filter(Boolean)) {
          // wmctrl format: "0x04000003  0 12345  hostname Window Title"
          const pid = line.split(/\s+/)[2]
          if (!pid || pid === '0') continue
          let exePath = ''
          try { exePath = run(['readlink', '-f', `/proc/${pid}/exe`]) } catch { /* ignore */ }
          let appName = ''
          try { appName = run(['cat', `/proc/${pid}/comm`]) } catch { /* ignore */ }
          if (appName) owners.push({ bundleId: exePath || pid, displayName: appName })
        }
        // One entry per bundleId, first occurrence wins.
        const seen = new Set<string>()
        return owners.filter(owner => {
          if (seen.has(owner.bundleId)) return false
          seen.add(owner.bundleId)
          return true
        }).slice(0, 50)
      }
      const raw = run(['ps', '-eo', 'pid,comm', '--no-headers'])
      const processes: RunningApp[] = []
      for (const line of raw.split('\n').filter(Boolean).slice(0, 50)) {
        const match = line.trim().match(/^(\d+)\s+(.+)$/)
        if (match?.[1] && match[2]) {
          processes.push({ bundleId: match[1], displayName: match[2] })
        }
      }
      return processes
    } catch {
      return []
    }
  },

  /**
   * Launches an application. `gtk-launch` is tried first (treating `name` as
   * a `.desktop` id); when it is missing, cannot be spawned, or exits with a
   * non-zero status, `xdg-open` is used instead.
   */
  async open(name): Promise<void> {
    const desktopName = name.endsWith('.desktop') ? name : `${name}.desktop`
    if (commandExists('gtk-launch')) {
      try {
        // Spawn directly: the exit status decides whether the fallback is needed.
        const proc = Bun.spawn(['gtk-launch', desktopName], { stdout: 'ignore', stderr: 'ignore' })
        if ((await proc.exited) === 0) return
      } catch { /* fall through to xdg-open */ }
    }
    await runAsync(['xdg-open', name])
  },

  /**
   * Brings windows to the front. Ids starting with `0x` are treated as window
   * ids for wmctrl; anything else is used as a window-name pattern for
   * xdotool. Failures for individual ids are ignored.
   */
  async unhide(bundleIds): Promise<void> {
    for (const id of bundleIds) {
      try {
        if (commandExists('wmctrl') && id.startsWith('0x')) {
          await runAsync(['wmctrl', '-i', '-R', id])
        } else {
          await runAsync(['xdotool', 'search', '--name', id, 'windowactivate'])
        }
      } catch { /* ignore failures for individual windows */ }
    }
  },
}
// ---------------------------------------------------------------------------
// ScreenshotAPI
// ---------------------------------------------------------------------------
/** Base name for screenshot temp files; each capture derives its own unique path from it. */
const SCREENSHOT_PATH = '/tmp/cu-screenshot.png'

/** Builds a per-capture file path so captures never share or reuse a file. */
function uniqueScreenshotPath(): string {
  const suffix = `${process.pid}-${Date.now()}-${Math.random().toString(36).slice(2, 10)}`
  return SCREENSHOT_PATH.replace(/\.png$/, `-${suffix}.png`)
}

/** Reads width/height from a PNG's IHDR chunk; `null` when the data is not a PNG. */
function readPngSize(png: Buffer): { width: number; height: number } | null {
  const isPng =
    png.length >= 24 &&
    png.readUInt32BE(0) === 0x89504e47 &&
    png.readUInt32BE(4) === 0x0d0a1a0a
  return isPng ? { width: png.readUInt32BE(16), height: png.readUInt32BE(20) } : null
}

/**
 * Runs scrot into a fresh temp file, returns its contents, and deletes the
 * file afterwards.
 *
 * @param scrotArgs Arguments placed before `-o <path>`.
 * @param fallbackSize Supplies dimensions when the PNG header is unreadable.
 * @returns The capture, or an empty result (`base64: ''`, 0x0) on any failure.
 */
async function captureWithScrot(
  scrotArgs: string[],
  fallbackSize: () => { width: number; height: number },
): Promise<ScreenshotResult> {
  const path = uniqueScreenshotPath()
  try {
    await runAsync(['scrot', ...scrotArgs, '-o', path])
    // The path is new for this call, so a failed scrot leaves nothing to read
    // and can never surface an image from an earlier capture.
    const png = Buffer.from(await Bun.file(path).arrayBuffer())
    if (png.length === 0) return { base64: '', width: 0, height: 0 }
    const size = readPngSize(png) ?? fallbackSize()
    return { base64: png.toString('base64'), width: size.width, height: size.height }
  } catch {
    return { base64: '', width: 0, height: 0 }
  } finally {
    // Do not leave screen contents behind on disk.
    try { await runAsync(['rm', '-f', path]) } catch { /* best effort */ }
  }
}

/**
 * Screenshot capture for Linux, implemented with scrot.
 *
 * The allow-list, quality and output-size parameters are accepted for
 * interface compatibility but are not applied. Reported dimensions come from
 * the captured PNG itself.
 */
export const screenshot: ScreenshotAPI = {
  /**
   * Captures what `scrot` captures by default. `_displayId` only selects the
   * fallback dimensions used when the PNG header cannot be read.
   */
  async captureExcluding(_allowedBundleIds, _quality, _targetW, _targetH, _displayId): Promise<ScreenshotResult> {
    return captureWithScrot([], () => display.getSize(_displayId))
  },

  /** Captures the rectangle `x,y,w,h` via `scrot -a`. */
  async captureRegion(_allowedBundleIds, x, y, w, h, _outW, _outH, _quality, _displayId): Promise<ScreenshotResult> {
    return captureWithScrot(['-a', `${x},${y},${w},${h}`], () => ({ width: w, height: h }))
  },
}

View File

@@ -0,0 +1,249 @@
/**
* Windows backend for computer-use-swift
*
* Uses PowerShell with .NET System.Drawing / System.Windows.Forms for
* screenshots and Win32 P/Invoke for window/process management.
*/
import type {
AppInfo, AppsAPI, DisplayAPI, DisplayGeometry, InstalledApp,
PrepareDisplayResult, RunningApp, ScreenshotAPI, ScreenshotResult,
SwiftBackend, WindowDisplayInfo,
} from '../types.js'
// ---------------------------------------------------------------------------
// PowerShell helper
// ---------------------------------------------------------------------------
/**
 * Runs a PowerShell script synchronously (`-NoProfile -NonInteractive`).
 *
 * @param script Script text passed to `-Command`.
 * @returns Trimmed stdout text. stderr and the exit code are ignored.
 */
function ps(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({ cmd: argv, stdout: 'pipe', stderr: 'pipe' })
  const decoded = new TextDecoder().decode(stdout)
  return decoded.trim()
}
/**
 * Runs a PowerShell script asynchronously (`-NoProfile -NonInteractive`):
 * collects stdout, waits for the process to exit, and returns the output.
 *
 * @param script Script text passed to `-Command`.
 * @returns Trimmed stdout text. stderr and the exit code are ignored.
 */
async function psAsync(script: string): Promise<string> {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const child = Bun.spawn(argv, { stdout: 'pipe', stderr: 'pipe' })
  const output = await new Response(child.stdout).text()
  await child.exited
  return output.trim()
}
// ---------------------------------------------------------------------------
// DisplayAPI
// ---------------------------------------------------------------------------
/**
 * Display enumeration for Windows, implemented with
 * `System.Windows.Forms.Screen` queried through PowerShell.
 */
export const display: DisplayAPI = {
  /**
   * Returns the geometry of one display.
   *
   * @param displayId Index into `Screen.AllScreens` as assigned by
   *   `listAll()`; when omitted or unknown the first display is used.
   * @returns The matching entry, else the first entry, else a hard-coded
   *   1920x1080 record with `displayId` 0.
   */
  getSize(displayId?: number): DisplayGeometry {
    const all = this.listAll()
    if (displayId !== undefined) {
      const found = all.find(d => d.displayId === displayId)
      if (found) return found
    }
    return all[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
  },

  /**
   * Lists screens as `width,height,index` records. `scaleFactor` is always
   * reported as 1. When PowerShell fails, prints nothing, or prints records
   * that do not parse to positive numbers, a single default display is
   * returned so callers never receive an empty or NaN-filled list.
   */
  listAll(): DisplayGeometry[] {
    const fallback: DisplayGeometry = { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
    try {
      const raw = ps(`
Add-Type -AssemblyName System.Windows.Forms
$result = @()
$idx = 0
foreach ($s in [System.Windows.Forms.Screen]::AllScreens) {
$result += "$($s.Bounds.Width),$($s.Bounds.Height),$idx,$($s.Primary)"
$idx++
}
$result -join "|"
`)
      const displays = raw
        .split('|')
        .filter(Boolean)
        .map(entry => {
          // Fourth field (primary flag) is emitted by the script but not needed here.
          const [w, h, id] = entry.split(',')
          return {
            width: Number(w),
            height: Number(h),
            scaleFactor: 1, // Windows DPI scaling handled at system level
            displayId: Number(id),
          }
        })
        .filter(d => d.width > 0 && d.height > 0 && Number.isInteger(d.displayId))
      return displays.length > 0 ? displays : [fallback]
    } catch {
      return [fallback]
    }
  },
}
// ---------------------------------------------------------------------------
// AppsAPI
// ---------------------------------------------------------------------------
/**
 * Renders a value as a PowerShell single-quoted string literal. Every
 * character PowerShell accepts as a single quote (ASCII and the typographic
 * variants) is doubled so the value cannot terminate the literal.
 */
function psSingleQuoted(value: string): string {
  return `'${value.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')}'`
}

/**
 * Application management for Windows, implemented with PowerShell and Win32
 * P/Invoke.
 *
 * `bundleId` is an executable path where one is available, otherwise a
 * process or registry key name. `prepareDisplay`, `previewHideSet`,
 * `findWindowDisplays` and `iconDataUrl` are placeholders that return fixed
 * values.
 */
export const apps: AppsAPI = {
  /** Placeholder: reports that nothing was activated or hidden. */
  async prepareDisplay(_allowlistBundleIds, _surrogateHost, _displayId) {
    return { activated: '', hidden: [] }
  },

  /** Placeholder: reports that no application would be hidden. */
  async previewHideSet(_bundleIds, _displayId) {
    return []
  },

  /** Placeholder: attributes every requested id to display 0. */
  async findWindowDisplays(bundleIds) {
    return bundleIds.map(bundleId => ({ bundleId, displayIds: [0] }))
  },

  /**
   * Identifies the process owning the window at a screen position using
   * `WindowFromPoint` and `GetWindowThreadProcessId`.
   *
   * @returns Executable path (or process name when the path is unavailable)
   *   and process name, or `null` when nothing could be resolved.
   */
  async appUnderPoint(_x, _y) {
    try {
      // The here-string terminator ('@) must stay at the start of its line.
      // The process id is stored in $procId because $pid is PowerShell's
      // read-only automatic variable holding the shell's own process id.
      const out = ps(`
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WinPt {
[StructLayout(LayoutKind.Sequential)] public struct POINT { public int X; public int Y; }
[DllImport("user32.dll")] public static extern IntPtr WindowFromPoint(POINT p);
[DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint pid);
}
'@
$pt = New-Object WinPt+POINT
$pt.X = ${Math.round(_x)}; $pt.Y = ${Math.round(_y)}
$hwnd = [WinPt]::WindowFromPoint($pt)
$procId = [uint32]0
[WinPt]::GetWindowThreadProcessId($hwnd, [ref]$procId) | Out-Null
$proc = Get-Process -Id $procId -ErrorAction SilentlyContinue
"$($proc.MainModule.FileName)|$($proc.ProcessName)"
`)
      if (!out || !out.includes('|')) return null
      const [exePath = '', name = ''] = out.split('|', 2).map(part => part.trim())
      if (!exePath && !name) return null
      return { bundleId: exePath || name, displayName: name }
    } catch {
      return null
    }
  },

  /**
   * Lists applications registered under the Uninstall registry keys (machine,
   * WOW6432Node and current user), at most 200 unique entries.
   */
  async listInstalled() {
    try {
      const raw = await psAsync(`
$apps = @()
$paths = @(
'HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
'HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
'HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*'
)
foreach ($p in $paths) {
Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object { $_.DisplayName } | ForEach-Object {
$apps += "$($_.DisplayName)|$($_.InstallLocation)|$($_.PSChildName)"
}
}
$apps | Select-Object -Unique | Select-Object -First 200
`)
      // PowerShell separates lines with CRLF; split on both forms so no
      // field keeps a trailing carriage return.
      return raw.split(/\r?\n/).filter(Boolean).map(line => {
        const [name, path, id] = line.split('|', 3)
        return {
          bundleId: id || name || '',
          displayName: name ?? '',
          path: path ?? '',
        }
      })
    } catch {
      return []
    }
  },

  /** Placeholder: icons are not provided by this backend. */
  iconDataUrl(_path) {
    return null
  },

  /** Lists up to 50 processes that own a titled main window. */
  listRunning() {
    try {
      const raw = ps(`Get-Process | Where-Object { $_.MainWindowTitle -ne '' } | Select-Object -First 50 | ForEach-Object { "$($_.MainModule.FileName)|$($_.ProcessName)" }`)
      return raw.split(/\r?\n/).filter(Boolean).map(line => {
        const [exePath, name] = line.split('|', 2)
        return { bundleId: exePath ?? '', displayName: name ?? '' }
      })
    } catch {
      return []
    }
  },

  /**
   * Starts an application. `name` is an executable path (bundleId) or a
   * program name resolvable by `Start-Process`.
   */
  async open(name) {
    const target = psSingleQuoted(name)
    await psAsync(`
if (Test-Path ${target}) {
Start-Process ${target}
} else {
Start-Process -FilePath ${target} -ErrorAction SilentlyContinue
}`)
  },

  /**
   * Restores and foregrounds the main window of each given application.
   * Each id is matched against both the executable path and the process
   * name, because ids produced by `listRunning` are executable paths. The id
   * is embedded as a single-quoted literal so it cannot inject script.
   */
  async unhide(bundleIds) {
    for (const name of bundleIds) {
      // The here-string terminator ('@) must stay at the start of its line.
      await psAsync(`
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WinShow {
[DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr hWnd, int nCmd);
[DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr hWnd);
}
'@
$target = ${psSingleQuoted(name)}
$proc = Get-Process -ErrorAction SilentlyContinue | Where-Object { $_.MainWindowHandle -ne 0 -and ($_.Path -eq $target -or $_.ProcessName -eq $target) } | Select-Object -First 1
if ($proc) { [WinShow]::ShowWindow($proc.MainWindowHandle, 9) | Out-Null; [WinShow]::SetForegroundWindow($proc.MainWindowHandle) | Out-Null }
`)
    }
  },
}
// ---------------------------------------------------------------------------
// ScreenshotAPI
// ---------------------------------------------------------------------------
/**
 * Parses the `"<width>,<height>,<base64>"` line printed by the capture
 * scripts.
 *
 * @returns The parsed capture, or `null` when the text does not have that
 *   shape (for example because PowerShell failed and printed nothing).
 */
function parseCaptureOutput(raw: string): ScreenshotResult | null {
  const firstComma = raw.indexOf(',')
  const secondComma = firstComma < 0 ? -1 : raw.indexOf(',', firstComma + 1)
  if (secondComma < 0) return null
  const width = Number(raw.slice(0, firstComma))
  const height = Number(raw.slice(firstComma + 1, secondComma))
  const base64 = raw.slice(secondComma + 1).trim()
  const valid =
    Number.isInteger(width) && Number.isInteger(height) && width > 0 && height > 0 && base64 !== ''
  return valid ? { base64, width, height } : null
}

/**
 * Screenshot capture for Windows, implemented with `System.Drawing` through
 * PowerShell.
 *
 * The allow-list, quality and output-size parameters are accepted for
 * interface compatibility but are not applied. When PowerShell produces no
 * usable output the methods return an empty result (`base64: ''`, 0x0).
 */
export const screenshot: ScreenshotAPI = {
  /**
   * Captures one whole screen.
   * @param displayId Index into `Screen.AllScreens`; omitted, negative,
   *   non-integer or out-of-range values select the primary screen.
   */
  async captureExcluding(_allowedBundleIds, _quality, _targetW, _targetH, displayId) {
    const screenIndex =
      displayId !== undefined && Number.isInteger(displayId) && displayId >= 0 ? displayId : -1
    const raw = await psAsync(`
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
$screens = [System.Windows.Forms.Screen]::AllScreens
$idx = ${screenIndex}
$screen = if ($idx -ge 0 -and $idx -lt $screens.Length) { $screens[$idx] } else { [System.Windows.Forms.Screen]::PrimaryScreen }
$bounds = $screen.Bounds
$bmp = New-Object System.Drawing.Bitmap($bounds.Width, $bounds.Height)
$g = [System.Drawing.Graphics]::FromImage($bmp)
$g.CopyFromScreen($bounds.Location, [System.Drawing.Point]::Empty, $bounds.Size)
$g.Dispose()
$ms = New-Object System.IO.MemoryStream
$bmp.Save($ms, [System.Drawing.Imaging.ImageFormat]::Png)
$bmp.Dispose()
$bytes = $ms.ToArray()
$ms.Dispose()
"$($bounds.Width),$($bounds.Height)," + [Convert]::ToBase64String($bytes)
`)
    return parseCaptureOutput(raw) ?? { base64: '', width: 0, height: 0 }
  },

  /**
   * Captures the rectangle `x,y,w,h` (rounded to whole pixels). A rectangle
   * without positive area yields an empty result without running PowerShell.
   */
  async captureRegion(_allowedBundleIds, x, y, w, h, _outW, _outH, _quality, _displayId) {
    // Bitmap and CopyFromScreen take integers; round once and use the same
    // values for the capture and for the reported size.
    const [left, top, width, height] = [x, y, w, h].map(value => Math.round(value))
    if (!(width > 0) || !(height > 0) || !Number.isFinite(left) || !Number.isFinite(top)) {
      return { base64: '', width: 0, height: 0 }
    }
    const raw = await psAsync(`
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
$bmp = New-Object System.Drawing.Bitmap(${width}, ${height})
$g = [System.Drawing.Graphics]::FromImage($bmp)
$g.CopyFromScreen(${left}, ${top}, 0, 0, (New-Object System.Drawing.Size(${width}, ${height})))
$g.Dispose()
$ms = New-Object System.IO.MemoryStream
$bmp.Save($ms, [System.Drawing.Imaging.ImageFormat]::Png)
$bmp.Dispose()
$bytes = $ms.ToArray()
$ms.Dispose()
"${width},${height}," + [Convert]::ToBase64String($bytes)
`)
    return parseCaptureOutput(raw) ?? { base64: '', width: 0, height: 0 }
  },
}

View File

@@ -1,377 +1,84 @@
/**
* @ant/computer-use-swift — macOS 实现
* @ant/computer-use-swift — cross-platform display, apps, and screenshot API
*
* 用 AppleScript/JXA/screencapture 替代原始 Swift 原生模块。
* 提供显示器信息、应用管理、截图等功能。
* Platform backends:
* - darwin: AppleScript/JXA + screencapture
* - win32: PowerShell + System.Drawing + Win32 P/Invoke
* - linux: xrandr + scrot + xdotool/wmctrl
*
* 仅 macOS 支持。
* Add new platforms by creating backends/<platform>.ts implementing SwiftBackend.
*/
import { readFileSync, unlinkSync } from 'fs'
import { tmpdir } from 'os'
import { join } from 'path'
// Re-export all types
export type {
DisplayGeometry,
PrepareDisplayResult,
AppInfo,
InstalledApp,
RunningApp,
ScreenshotResult,
ResolvePrepareCaptureResult,
WindowDisplayInfo,
DisplayAPI,
AppsAPI,
ScreenshotAPI,
SwiftBackend,
} from './types.js'
import type { ResolvePrepareCaptureResult, SwiftBackend } from './types.js'
// ---------------------------------------------------------------------------
// Types (exported for callers)
// Platform dispatch
// ---------------------------------------------------------------------------
/** Size and identity of a single display as reported by the display API. */
export interface DisplayGeometry {
/** Display width. NOTE(review): unit depends on the query that produced it (the two macOS queries differ) — confirm with callers. */
width: number
/** Display height, in the same unit as `width`. */
height: number
/** Ratio of backing pixels to `width` units; the macOS code derives it from the display mode or the screen's backing scale factor. */
scaleFactor: number
/** Identifier of the display; the value used to select a display in `getSize` and the screenshot calls. */
displayId: number
}
/** Outcome of `AppsAPI.prepareDisplay`. */
export interface PrepareDisplayResult {
/** Presumably the identifier of the application that was activated; the implementation in this file always returns an empty string — TODO confirm intended meaning. */
activated: string
/** Presumably identifiers of the applications that were hidden; the implementation in this file always returns an empty list. */
hidden: string[]
}
/** Minimal identification of an application. */
export interface AppInfo {
/** Identifier used to address the application in `open`/`unhide`. */
bundleId: string
/** Human-readable application name. */
displayName: string
}
/** An application found by `AppsAPI.listInstalled`. */
export interface InstalledApp {
/** Identifier of the application. */
bundleId: string
/** Human-readable application name. */
displayName: string
/** Filesystem location associated with the application. */
path: string
/** Optional icon; presumably a `data:` URL given the name — not populated anywhere in this file. */
iconDataUrl?: string
}
/** An application reported by `AppsAPI.listRunning`. */
export interface RunningApp {
/** Identifier of the running application. */
bundleId: string
/** Human-readable name of the running application. */
displayName: string
}
/** A captured image returned by the screenshot API. */
export interface ScreenshotResult {
/** Image file contents encoded as base64 (the capture helper in this file reads a PNG). */
base64: string
/** Image width in pixels. */
width: number
/** Image height in pixels. */
height: number
}
/**
 * Same shape as `ScreenshotResult`.
 * NOTE(review): no producer of this type is visible in this part of the file;
 * presumably the result of a combined prepare-and-capture call — confirm.
 */
export interface ResolvePrepareCaptureResult {
/** Image data encoded as base64. */
base64: string
/** Image width. */
width: number
/** Image height. */
height: number
}
/** One entry of `AppsAPI.findWindowDisplays`: an application and the displays attributed to it. */
export interface WindowDisplayInfo {
/** Identifier of the application the entry describes. */
bundleId: string
/** Display identifiers attributed to the application. */
displayIds: number[]
}
// ---------------------------------------------------------------------------
// Helpers
// ---------------------------------------------------------------------------
/**
 * Synchronously evaluates a JXA snippet with `osascript -l JavaScript -e`.
 *
 * @param script JXA source text.
 * @returns stdout as trimmed text; stderr and the exit status are not checked.
 */
function jxaSync(script: string): string {
  const command = ['osascript', '-l', 'JavaScript', '-e', script]
  const finished = Bun.spawnSync({ cmd: command, stdout: 'pipe', stderr: 'pipe' })
  const text = new TextDecoder().decode(finished.stdout)
  return text.trim()
}
/**
 * Synchronously evaluates an AppleScript snippet with `osascript -e`.
 *
 * @param script AppleScript source text.
 * @returns stdout as trimmed text; stderr and the exit status are not checked.
 */
function osascriptSync(script: string): string {
  const command = ['osascript', '-e', script]
  const finished = Bun.spawnSync({ cmd: command, stdout: 'pipe', stderr: 'pipe' })
  const text = new TextDecoder().decode(finished.stdout)
  return text.trim()
}
/**
 * Evaluates an AppleScript snippet with `osascript -e` without blocking.
 *
 * @param script AppleScript source text.
 * @returns stdout as trimmed text, available once the process has exited;
 *   stderr and the exit status are not checked.
 */
async function osascript(script: string): Promise<string> {
  const command = ['osascript', '-e', script]
  const running = Bun.spawn(command, { stdout: 'pipe', stderr: 'pipe' })
  const collected = await new Response(running.stdout).text()
  await running.exited
  return collected.trim()
}
/**
 * Evaluates a JXA snippet with `osascript -l JavaScript -e` without blocking.
 *
 * @param script JXA source text.
 * @returns stdout as trimmed text, available once the process has exited;
 *   stderr and the exit status are not checked.
 */
async function jxa(script: string): Promise<string> {
  const command = ['osascript', '-l', 'JavaScript', '-e', script]
  const running = Bun.spawn(command, { stdout: 'pipe', stderr: 'pipe' })
  const collected = await new Response(running.stdout).text()
  await running.exited
  return collected.trim()
}
// ---------------------------------------------------------------------------
// DisplayAPI
// ---------------------------------------------------------------------------
/** Queries about attached displays. */
interface DisplayAPI {
/** Returns the geometry of the display with the given id; the implementation in this file falls back to the first display when the id is omitted or unknown. */
getSize(displayId?: number): DisplayGeometry
/** Returns the geometry of every display that could be enumerated. */
listAll(): DisplayGeometry[]
}
/**
 * macOS display enumeration built on JXA scripts run through `jxaSync`.
 */
const displayAPI: DisplayAPI = {
  /**
   * Resolves one display's geometry.
   *
   * @param displayId Identifier to look for; optional.
   * @returns The entry with that id when present, otherwise the first listed
   *   display, otherwise a hard-coded 1920x1080 @2x record with id 1.
   */
  getSize(displayId?: number): DisplayGeometry {
    const listed = this.listAll()
    const wanted =
      displayId === undefined ? undefined : listed.find(item => item.displayId === displayId)
    if (wanted) return wanted
    return listed[0] ?? { width: 1920, height: 1080, scaleFactor: 2, displayId: 1 }
  },

  /**
   * Enumerates displays, trying CoreGraphics first and NSScreen second. A
   * strategy counts as failed when its script output cannot be parsed as
   * JSON; when both fail a single default display is returned.
   */
  listAll(): DisplayGeometry[] {
    // Shared post-processing: parse the JSON array and force numeric fields.
    const toGeometries = (json: string): DisplayGeometry[] =>
      (JSON.parse(json) as DisplayGeometry[]).map(item => ({
        width: Number(item.width),
        height: Number(item.height),
        scaleFactor: Number(item.scaleFactor),
        displayId: Number(item.displayId),
      }))

    try {
      return toGeometries(jxaSync(`
ObjC.import("CoreGraphics");
var displays = $.CGDisplayCopyAllDisplayModes ? [] : [];
var active = $.CGGetActiveDisplayList(10, null, Ref());
var countRef = Ref();
$.CGGetActiveDisplayList(0, null, countRef);
var count = countRef[0];
var idBuf = Ref();
$.CGGetActiveDisplayList(count, idBuf, countRef);
var result = [];
for (var i = 0; i < count; i++) {
var did = idBuf[i];
var w = $.CGDisplayPixelsWide(did);
var h = $.CGDisplayPixelsHigh(did);
var mode = $.CGDisplayCopyDisplayMode(did);
var pw = $.CGDisplayModeGetPixelWidth(mode);
var sf = pw > 0 && w > 0 ? pw / w : 2;
result.push({width: w, height: h, scaleFactor: sf, displayId: did});
}
JSON.stringify(result);
`))
    } catch {
      // Second attempt: ask AppKit's NSScreen instead.
      try {
        return toGeometries(jxaSync(`
ObjC.import("AppKit");
var screens = $.NSScreen.screens;
var result = [];
for (var i = 0; i < screens.count; i++) {
var s = screens.objectAtIndex(i);
var frame = s.frame;
var desc = s.deviceDescription;
var screenNumber = desc.objectForKey($("NSScreenNumber")).intValue;
var backingFactor = s.backingScaleFactor;
result.push({
width: Math.round(frame.size.width),
height: Math.round(frame.size.height),
scaleFactor: backingFactor,
displayId: screenNumber
});
}
JSON.stringify(result);
`))
      } catch {
        return [{ width: 1920, height: 1080, scaleFactor: 2, displayId: 1 }]
      }
    }
  },
}
// ---------------------------------------------------------------------------
// AppsAPI
// ---------------------------------------------------------------------------
/** Application discovery and window management operations. */
interface AppsAPI {
/** Prepares a display for capture; the implementation in this file is a stub that reports nothing activated or hidden. */
prepareDisplay(allowlistBundleIds: string[], surrogateHost: string, displayId?: number): Promise<PrepareDisplayResult>
/** Previews which applications would be hidden; the implementation in this file always returns an empty list. */
previewHideSet(bundleIds: string[], displayId?: number): Promise<AppInfo[]>
/** Maps each bundle id to the displays its windows are on; the implementation in this file assigns display 1 to every id. */
findWindowDisplays(bundleIds: string[]): Promise<WindowDisplayInfo[]>
/** Resolves the application at a screen position, or `null` when it cannot be determined. */
appUnderPoint(x: number, y: number): Promise<AppInfo | null>
/** Lists installed applications. */
listInstalled(): Promise<InstalledApp[]>
/** Returns an icon for the application at `path`, or `null` when none is available. */
iconDataUrl(path: string): string | null
/** Lists currently running applications (synchronous). */
listRunning(): RunningApp[]
/** Launches or activates the application with the given bundle id. */
open(bundleId: string): Promise<void>
/** Makes the given applications visible again. */
unhide(bundleIds: string[]): Promise<void>
}
const appsAPI: AppsAPI = {
async prepareDisplay(
_allowlistBundleIds: string[],
_surrogateHost: string,
_displayId?: number,
): Promise<PrepareDisplayResult> {
return { activated: '', hidden: [] }
},
async previewHideSet(
_bundleIds: string[],
_displayId?: number,
): Promise<AppInfo[]> {
return []
},
async findWindowDisplays(bundleIds: string[]): Promise<WindowDisplayInfo[]> {
// Each running app is assumed to be on display 1
return bundleIds.map(bundleId => ({ bundleId, displayIds: [1] }))
},
async appUnderPoint(_x: number, _y: number): Promise<AppInfo | null> {
// Use JXA to find app at mouse position via accessibility
try {
const result = await jxa(`
ObjC.import("CoreGraphics");
ObjC.import("AppKit");
var pt = $.CGPointMake(${_x}, ${_y});
// Get frontmost app as a fallback
var app = $.NSWorkspace.sharedWorkspace.frontmostApplication;
JSON.stringify({bundleId: app.bundleIdentifier.js, displayName: app.localizedName.js});
`)
return JSON.parse(result)
} catch {
return null
}
},
async listInstalled(): Promise<InstalledApp[]> {
try {
const result = await osascript(`
tell application "System Events"
set appList to ""
repeat with appFile in (every file of folder "Applications" of startup disk whose name ends with ".app")
set appPath to POSIX path of (appFile as alias)
set appName to name of appFile
set appList to appList & appPath & "|" & appName & "\\n"
end repeat
return appList
end tell
`)
return result.split('\n').filter(Boolean).map(line => {
const [path, name] = line.split('|', 2)
// Derive bundleId from Info.plist would be ideal, but use path-based fallback
const displayName = (name ?? '').replace(/\.app$/, '')
return {
bundleId: `com.app.${displayName.toLowerCase().replace(/\s+/g, '-')}`,
displayName,
path: path ?? '',
}
})
} catch {
return []
}
},
iconDataUrl(_path: string): string | null {
return null
},
listRunning(): RunningApp[] {
try {
const raw = jxaSync(`
var apps = Application("System Events").applicationProcesses.whose({backgroundOnly: false});
var result = [];
for (var i = 0; i < apps.length; i++) {
try {
var a = apps[i];
result.push({bundleId: a.bundleIdentifier(), displayName: a.name()});
} catch(e) {}
}
JSON.stringify(result);
`)
return JSON.parse(raw)
} catch {
return []
}
},
/**
 * Activates the application with the given bundle ID via AppleScript.
 *
 * @param bundleId Bundle identifier of the app to activate.
 */
async open(bundleId: string): Promise<void> {
  // The ID is embedded in an AppleScript string literal: escape backslashes
  // first, then double quotes, so the value cannot terminate the literal and
  // inject additional script.
  const quotedId = bundleId.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
  await osascript(`tell application id "${quotedId}" to activate`)
},
/**
 * Makes the application process for each bundle ID visible again through
 * System Events. Apps are processed one at a time, in input order.
 *
 * @param bundleIds Bundle identifiers of the apps to unhide.
 */
async unhide(bundleIds: string[]): Promise<void> {
  for (const bundleId of bundleIds) {
    // The ID is embedded in an AppleScript string literal: escape backslashes
    // first, then double quotes, so the value cannot terminate the literal
    // and inject additional script.
    const quotedId = bundleId.replace(/\\/g, '\\\\').replace(/"/g, '\\"')
    await osascript(`
tell application "System Events"
set visible of application process (name of application process whose bundle identifier is "${quotedId}") to true
end tell
`)
  }
},
}
// ---------------------------------------------------------------------------
// ScreenshotAPI
// ---------------------------------------------------------------------------
/**
 * Screenshot operations offered by a backend. Both methods resolve to a
 * ScreenshotResult (base64 data plus width and height).
 */
interface ScreenshotAPI {
// Captures a display (optionally selected by displayId). The name suggests
// apps outside allowedBundleIds are excluded — confirm per backend.
captureExcluding(
allowedBundleIds: string[], quality: number,
targetW: number, targetH: number, displayId?: number,
): Promise<ScreenshotResult>
// Captures the rectangle given by x, y, w, h; outW/outH are presumably the
// requested output dimensions — confirm per backend.
captureRegion(
allowedBundleIds: string[],
x: number, y: number, w: number, h: number,
outW: number, outH: number, quality: number, displayId?: number,
): Promise<ScreenshotResult>
}
/**
 * Runs the `screencapture` CLI into a temporary PNG file and returns the
 * file's contents.
 *
 * @param args Arguments placed before the output path on the command line.
 * @returns The file bytes as base64, plus the width and height read as
 *   big-endian 32-bit integers from bytes 16-23 of the file (the PNG header).
 * @throws If the output file cannot be read or is too short to contain the
 *   header fields.
 */
async function captureScreenToBase64(args: string[]): Promise<{ base64: string; width: number; height: number }> {
  const tmpFile = join(tmpdir(), `cu-screenshot-${Date.now()}.png`)
  const proc = Bun.spawn(['screencapture', ...args, tmpFile], {
    stdout: 'pipe', stderr: 'pipe',
  })
  await proc.exited
  try {
    const buf = readFileSync(tmpFile)
    const base64 = buf.toString('base64')
    // Parse PNG header for dimensions (bytes 16-23)
    const width = buf.readUInt32BE(16)
    const height = buf.readUInt32BE(20)
    return { base64, width, height }
  } finally {
    // Best-effort cleanup: the temp file may not exist if the capture failed.
    try { unlinkSync(tmpFile) } catch {}
  }
}

/**
 * Loads the backend module matching `process.platform`.
 *
 * @returns The backend for darwin, win32 or linux; `null` for any other
 *   platform or when loading the backend module throws.
 */
function loadBackend(): SwiftBackend | null {
  try {
    switch (process.platform) {
      case 'darwin':
        return require('./backends/darwin.js') as SwiftBackend
      case 'win32':
        return require('./backends/win32.js') as SwiftBackend
      case 'linux':
        return require('./backends/linux.js') as SwiftBackend
      default:
        return null
    }
  } catch {
    return null
  }
}
/**
 * ScreenshotAPI implementation that delegates to captureScreenToBase64.
 *
 * The allow-list, quality and output-size parameters are accepted to satisfy
 * the interface but are not used by either method.
 */
const screenshotAPI: ScreenshotAPI = {
  /** Captures a screen; `-D <displayId>` is appended only when a display is given. */
  async captureExcluding(
    _allowedBundleIds: string[],
    _quality: number,
    _targetW: number,
    _targetH: number,
    displayId?: number,
  ): Promise<ScreenshotResult> {
    const displaySelector: string[] =
      displayId === undefined ? [] : ['-D', String(displayId)]
    // -x: silent
    return captureScreenToBase64(['-x', ...displaySelector])
  },

  /** Captures the rectangle `x,y,w,h` (passed via `-R`), optionally on a given display. */
  async captureRegion(
    _allowedBundleIds: string[],
    x: number, y: number, w: number, h: number,
    _outW: number, _outH: number, _quality: number,
    displayId?: number,
  ): Promise<ScreenshotResult> {
    const rect = `${x},${y},${w},${h}`
    const displaySelector: string[] =
      displayId === undefined ? [] : ['-D', String(displayId)]
    return captureScreenToBase64(['-x', '-R', rect, ...displaySelector])
  },
}
// Platform backend resolved once when this module is evaluated; null when
// loadBackend() has no backend for this platform or loading it failed.
const backend = loadBackend()
// ---------------------------------------------------------------------------
// ComputerUseAPI — Main export (preserves original class interface)
// ---------------------------------------------------------------------------
export class ComputerUseAPI {
apps: AppsAPI = appsAPI
display: DisplayAPI = displayAPI
screenshot: ScreenshotAPI = screenshotAPI
// When no backend is loaded (unsupported platform), all APIs are no-op stubs.
// These stubs should never be reached in practice — callers check isSupported
// or the feature gate before invoking.
apps = backend?.apps ?? {
async prepareDisplay() { return { activated: '', hidden: [] } },
async previewHideSet() { return [] },
async findWindowDisplays(ids: string[]) { return ids.map(b => ({ bundleId: b, displayIds: [] as number[] })) },
async appUnderPoint() { return null },
async listInstalled() { return [] },
iconDataUrl() { return null },
listRunning() { return [] },
async open() { throw new Error('computer-use-swift: no backend for this platform') },
async unhide() {},
}
display = backend?.display ?? {
getSize() { throw new Error('computer-use-swift: no backend for this platform') },
listAll() { throw new Error('computer-use-swift: no backend for this platform') },
}
screenshot = backend?.screenshot ?? {
async captureExcluding() { throw new Error('computer-use-swift: no backend for this platform') },
async captureRegion() { throw new Error('computer-use-swift: no backend for this platform') },
}
async resolvePrepareCapture(
allowedBundleIds: string[],

View File

@@ -0,0 +1,80 @@
/**
 * Dimensions and identity of a single display, as returned by DisplayAPI.
 */
export interface DisplayGeometry {
// Display width (unit — points vs. pixels — is backend-defined; TODO confirm).
width: number
// Display height (same unit as width).
height: number
// Presumably the backing-pixel-to-logical-unit ratio; verify against backends.
scaleFactor: number
// Identifier used to select this display in other API calls.
displayId: number
}
/**
 * Outcome of AppsAPI.prepareDisplay.
 */
export interface PrepareDisplayResult {
// Presumably the identifier of the app that was activated — TODO confirm.
activated: string
// Presumably identifiers of the apps that were hidden — TODO confirm.
hidden: string[]
}
/**
 * Minimal description of an application: its bundle ID and display name.
 */
export interface AppInfo {
// Application bundle identifier.
bundleId: string
// Human-readable application name.
displayName: string
}
/**
 * An application found on disk, as returned by AppsAPI.listInstalled.
 */
export interface InstalledApp {
// Application bundle identifier.
bundleId: string
// Human-readable application name.
displayName: string
// Filesystem path of the application.
path: string
// Optional icon, presumably encoded as a data URL — confirm with producers.
iconDataUrl?: string
}
/**
 * A currently running application, as returned by AppsAPI.listRunning.
 */
export interface RunningApp {
// Application bundle identifier.
bundleId: string
// Human-readable application name.
displayName: string
}
/**
 * A captured image, as resolved by the ScreenshotAPI methods.
 */
export interface ScreenshotResult {
// Image data encoded as base64 (image format is backend-defined).
base64: string
// Image width.
width: number
// Image height.
height: number
}
/**
 * Image payload with the same three fields as ScreenshotResult.
 * NOTE(review): presumably the result of ComputerUseAPI.resolvePrepareCapture —
 * confirm against its implementation.
 */
export interface ResolvePrepareCaptureResult {
// Image data encoded as base64.
base64: string
// Image width.
width: number
// Image height.
height: number
}
/**
 * Associates an application with display IDs, as returned by
 * AppsAPI.findWindowDisplays.
 */
export interface WindowDisplayInfo {
// Application bundle identifier.
bundleId: string
// IDs of the displays reported for this application (may be empty).
displayIds: number[]
}
/**
 * Synchronous display queries provided by a backend.
 */
export interface DisplayAPI {
// Geometry of one display; which display is used when displayId is omitted
// is backend-defined.
getSize(displayId?: number): DisplayGeometry
// Geometry of every display the backend reports.
listAll(): DisplayGeometry[]
}
/**
 * Application-management operations provided by a backend. All methods are
 * asynchronous except iconDataUrl and listRunning.
 */
export interface AppsAPI {
// Prepares a display for the allow-listed apps and reports what was
// activated/hidden; exact semantics are backend-defined — TODO confirm.
prepareDisplay(allowlistBundleIds: string[], surrogateHost: string, displayId?: number): Promise<PrepareDisplayResult>
// Presumably previews which apps prepareDisplay would hide — TODO confirm.
previewHideSet(bundleIds: string[], displayId?: number): Promise<AppInfo[]>
// Reports, per bundle ID, the displays associated with that app.
findWindowDisplays(bundleIds: string[]): Promise<WindowDisplayInfo[]>
// App at the given screen point, or null when none can be determined.
appUnderPoint(x: number, y: number): Promise<AppInfo | null>
// Applications installed on the machine.
listInstalled(): Promise<InstalledApp[]>
// Icon for the app at `path`, or null when unavailable.
iconDataUrl(path: string): string | null
// Applications currently running.
listRunning(): RunningApp[]
// Opens/activates the app with the given bundle ID.
open(bundleId: string): Promise<void>
// Makes the given apps visible again.
unhide(bundleIds: string[]): Promise<void>
}
/**
 * Screenshot operations provided by a backend. Both methods resolve to a
 * ScreenshotResult.
 */
export interface ScreenshotAPI {
// Captures a display (optionally selected by displayId). The name suggests
// apps outside allowedBundleIds are excluded — confirm per backend.
captureExcluding(
allowedBundleIds: string[], quality: number,
targetW: number, targetH: number, displayId?: number,
): Promise<ScreenshotResult>
// Captures the rectangle given by x, y, w, h; outW/outH are presumably the
// requested output dimensions — confirm per backend.
captureRegion(
allowedBundleIds: string[],
x: number, y: number, w: number, h: number,
outW: number, outH: number, quality: number, displayId?: number,
): Promise<ScreenshotResult>
}
/**
 * A complete platform backend: the display, apps and screenshot APIs bundled
 * together.
 */
export interface SwiftBackend {
// Display geometry queries.
display: DisplayAPI
// Application management operations.
apps: AppsAPI
// Screen capture operations.
screenshot: ScreenshotAPI
}