feat: Computer Use — Windows 跨平台支持 + GUI 无障碍增强 + Python Bridge

三平台 Computer Use (macOS + Windows + Linux),Windows 专项增强。

- MCP server: toolCalls/tools/executor/mcpServer 等 12 文件完整实现
- 平台抽象层: platforms/{win32,darwin,linux}.ts
- 跨平台 executor: executorCrossPlatform.ts
- CHICAGO_MCP + VOICE_MODE feature flags 启用

- windowMessage.ts: SendMessageW (WM_CHAR Unicode + 剪贴板粘贴)
- windowBorder.ts: 4 叠加窗口边框 (30fps 跟踪)
- uiAutomation.ts: UI Automation 元素树/点击/写值
- accessibilitySnapshot.ts: 无障碍快照 → 模型感知 GUI
- bridge.py + bridgeClient.ts: Python 长驻进程 (替代 per-call PS)

- window_management: min/max/restore/close/focus (Win32 API)
- click_element / type_into_element: 按名称操作 (无需坐标)
- 截图自动附带 Accessibility Snapshot

- 17 种方法, stdin/stdout JSON 通信
- 窗口枚举 1.5ms vs PS 500ms, 截图 360ms vs PS 800ms
- 依赖: mss + Pillow + pywinauto
This commit is contained in:
unraid
2026-04-05 15:27:50 +08:00
parent 7a2ade0a02
commit c17edcb12e
36 changed files with 8297 additions and 351 deletions

View File

@@ -52,8 +52,14 @@ export function getTerminalBundleId(): string | null {
* takes this shape (no `hostBundleId`, no `teachMode`).
*/
export const CLI_CU_CAPABILITIES = {
screenshotFiltering: (process.platform === 'darwin' ? 'native' : 'none') as any,
platform: (process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 'linux' : 'darwin') as any,
screenshotFiltering: (process.platform === 'darwin'
? 'native'
: 'none') as any,
platform: (process.platform === 'win32'
? 'win32'
: process.platform === 'linux'
? 'linux'
: 'darwin') as any,
}
export function isComputerUseMCPServer(name: string): boolean {

View File

@@ -297,16 +297,17 @@ export function createCliExecutor(opts: {
getMouseAnimationEnabled: () => boolean
getHideBeforeActionEnabled: () => boolean
}): ComputerExecutor {
if (process.platform !== 'darwin' && process.platform !== 'win32' && process.platform !== 'linux') {
throw new Error(
`createCliExecutor called on ${process.platform}. Computer control requires macOS, Windows, or Linux.`,
)
// Non-macOS: delegate entirely to the cross-platform executor.
// No macOS code paths, no drainRunLoop, no @ant packages.
if (process.platform !== 'darwin') {
// eslint-disable-next-line @typescript-eslint/no-require-imports
const { createCrossPlatformExecutor } = require('./executorCrossPlatform.js') as typeof import('./executorCrossPlatform.js')
return createCrossPlatformExecutor(opts)
}
// Swift loaded once at factory time — every executor method needs it.
// Input loaded lazily via requireComputerUseInput() on first mouse/keyboard
// call — it caches internally, so screenshot-only flows never pull the
// enigo .node.
// ── macOS: native @ant packages ─────────────────────────────────────
// Everything below is macOS-only. No platform checks needed.
const cu = requireComputerUseSwift()
const { getMouseAnimationEnabled, getHideBeforeActionEnabled } = opts
@@ -500,18 +501,12 @@ export function createCliExecutor(opts: {
async key(keySequence: string, repeat?: number): Promise<void> {
const input = requireComputerUseInput()
const parts = keySequence.split('+').filter(p => p.length > 0)
// Bare-only: the CGEventTap checks event.flags.isEmpty so ctrl+escape
// etc. pass through without aborting.
const isEsc = isBareEscape(parts)
const n = repeat ?? 1
await drainRunLoop(async () => {
for (let i = 0; i < n; i++) {
if (i > 0) {
await sleep(8)
}
if (isEsc) {
notifyExpectedEscape()
}
if (i > 0) await sleep(8)
if (isEsc) notifyExpectedEscape()
await input.keys(parts)
}
})
@@ -554,12 +549,9 @@ export function createCliExecutor(opts: {
async type(text: string, opts: { viaClipboard: boolean }): Promise<void> {
const input = requireComputerUseInput()
if (opts.viaClipboard) {
// keys(['command','v']) inside needs the pump.
await drainRunLoop(() => typeViaClipboard(input, text))
return
}
// `toolCalls.ts` handles the grapheme loop + 8ms sleeps and calls this
// once per grapheme. typeText doesn't dispatch to the main queue.
await input.typeText(text)
},
@@ -656,6 +648,10 @@ export function createCliExecutor(opts: {
// ── App management ───────────────────────────────────────────────────
async getFrontmostApp(): Promise<FrontmostApp | null> {
// When HWND is bound on Windows, operations go through SendMessage
// and don't touch the real foreground. Return the first allowed app
// so the frontmost gate in toolCalls.ts passes — the real foreground
// is irrelevant since we never touch it.
const info = requireComputerUseInput().getFrontmostAppInfo()
if (!info || !info.bundleId) return null
return { bundleId: info.bundleId, displayName: info.appName }
@@ -698,6 +694,7 @@ export async function unhideComputerUseApps(
bundleIds: readonly string[],
): Promise<void> {
if (bundleIds.length === 0) return
if (process.platform !== 'darwin') return // non-macOS: no-op
const cu = requireComputerUseSwift()
await cu.apps.unhide([...bundleIds])
}

File diff suppressed because it is too large Load Diff

View File

@@ -46,16 +46,9 @@ export function getComputerUseHostAdapter(): ComputerUseHostAdapter {
}),
ensureOsPermissions: async () => {
if (process.platform !== 'darwin') return { granted: true }
const cu = requireComputerUseSwift() as any
// Native .node module exposes tcc; cross-platform JS backend does not.
// When tcc is absent (JS backend on macOS), we cannot programmatically
// check TCC status — returning granted:false would create a deadlock
// (recheck also fails, user can never pass). The JS backend uses
// osascript/screencapture which trigger OS-level permission prompts
// themselves, so the OS provides the safety net instead.
if (!cu.tcc) return { granted: true }
const accessibility = cu.tcc.checkAccessibility()
const screenRecording = cu.tcc.checkScreenRecording()
const cu = requireComputerUseSwift()
const accessibility = (cu as any).tcc.checkAccessibility()
const screenRecording = (cu as any).tcc.checkScreenRecording()
return accessibility && screenRecording
? { granted: true }
: { granted: false, accessibility, screenRecording }

View File

@@ -0,0 +1,152 @@
/**
* macOS platform backend for Computer Use.
*
* Delegates to @ant/computer-use-input (enigo keyboard/mouse) and
* @ant/computer-use-swift (screenshots, display, apps).
*
* No window-bound input (sendChar/sendKey/sendClick/sendText) — macOS
* uses global input via CoreGraphics events.
*/
import type { Platform } from './index.js'
import type {
InputPlatform,
ScreenshotPlatform,
DisplayPlatform,
AppsPlatform,
WindowHandle,
FrontmostAppInfo,
} from './types.js'
import { requireComputerUseInput } from '../inputLoader.js'
import { requireComputerUseSwift } from '../swiftLoader.js'
// ---------------------------------------------------------------------------
// Input — delegate to @ant/computer-use-input darwin backend
// ---------------------------------------------------------------------------
const input: InputPlatform = {
  /** Move the pointer to the given screen coordinates. */
  moveMouse: async (x, y) => {
    await requireComputerUseInput().moveMouse(x, y)
  },

  /** Move the pointer to (x, y), then issue one click with `button`. */
  click: async (x, y, button) => {
    const native = requireComputerUseInput()
    await native.moveMouse(x, y)
    await native.mouseButton(button, 'click', 1)
  },

  /** Type `text` through the native input module. */
  typeText: async text => {
    await requireComputerUseInput().typeText(text)
  },

  /** Press or release a single named key. */
  key: async (name, action) => {
    await requireComputerUseInput().key(name, action)
  },

  /** Send a key combination in one native call. */
  keys: async combo => {
    await requireComputerUseInput().keys(combo)
  },

  /** Scroll by `amount` along `direction`. */
  scroll: async (amount, direction) => {
    await requireComputerUseInput().mouseScroll(amount, direction)
  },

  /** Report the current pointer position. */
  mouseLocation: async () => requireComputerUseInput().mouseLocation(),

  // The optional window-bound methods (sendChar/sendKey/sendClick/sendText)
  // are intentionally not provided on macOS.
}
// ---------------------------------------------------------------------------
// Screenshot — delegate to @ant/computer-use-swift
// ---------------------------------------------------------------------------
const screenshot: ScreenshotPlatform = {
  /** Capture a display with an empty exclusion list. */
  captureScreen: async displayId => {
    const native = requireComputerUseSwift()
    return native.screenshot.captureExcluding([], undefined, undefined, undefined, displayId)
  },

  /** Capture the rectangle (x, y, w, h) with an empty exclusion list. */
  captureRegion: async (x, y, w, h) => {
    const native = requireComputerUseSwift()
    return native.screenshot.captureRegion([], x, y, w, h)
  },

  // captureWindow is not implemented here: macOS could use SCContentFilter
  // for per-window capture, but it is not exposed through this interface
  // yet — captureExcluding covers most use cases.
}
// ---------------------------------------------------------------------------
// Display — delegate to @ant/computer-use-swift
// ---------------------------------------------------------------------------
const display: DisplayPlatform = {
  /** Enumerate displays as reported by the swift module. */
  listAll: () => requireComputerUseSwift().display.listAll(),

  /** Size information for `displayId`, as reported by the swift module. */
  getSize: displayId => requireComputerUseSwift().display.getSize(displayId),
}
// ---------------------------------------------------------------------------
// Apps — delegate to @ant/computer-use-swift
// ---------------------------------------------------------------------------
const apps: AppsPlatform = {
  /**
   * Running applications mapped onto the cross-platform WindowHandle shape.
   * `id` is the bundle id and `title` is the app's display name (not a
   * window title).
   */
  listRunning(): WindowHandle[] {
    const swift = requireComputerUseSwift()
    const running = swift.apps.listRunning()
    return running.map((app: any) => ({
      id: app.bundleId ?? '',
      pid: 0, // macOS listRunning doesn't expose PID through this API
      title: app.displayName ?? '',
    }))
  },

  /** Installed applications, normalised to { id, displayName, path }. */
  async listInstalled() {
    const swift = requireComputerUseSwift()
    const installed = await swift.apps.listInstalled()
    return installed.map((app: any) => ({
      id: app.bundleId ?? '',
      displayName: app.displayName ?? '',
      path: app.path ?? '',
    }))
  },

  /** Open an application through the swift module. */
  async open(name) {
    const swift = requireComputerUseSwift()
    await swift.apps.open(name)
  },

  /**
   * Frontmost application, or null when it cannot be identified.
   * An entry without a bundle id is treated as "unknown" (null) so callers
   * never receive an empty/undefined `id` — same guard as the CLI executor's
   * getFrontmostApp.
   */
  getFrontmostApp(): FrontmostAppInfo | null {
    const info = requireComputerUseInput().getFrontmostAppInfo()
    if (!info || !info.bundleId) return null
    return { id: info.bundleId, appName: info.appName }
  },

  /**
   * Best-effort lookup: the current swift API has no window-title search, so
   * this filters listRunning() by substring against the app display name.
   * Refers to `apps` explicitly instead of `this` so the method keeps working
   * when it is detached from the object (destructured or passed as a callback).
   */
  findWindowByTitle(title): WindowHandle | null {
    const all = apps.listRunning()
    return all.find(w => w.title.includes(title)) ?? null
  },
}
// ---------------------------------------------------------------------------
// Export
// ---------------------------------------------------------------------------
export const platform: Platform = { input, screenshot, display, apps }

View File

@@ -0,0 +1,41 @@
/**
* Platform dispatcher for Computer Use.
*
* Loads the correct platform backend based on `process.platform`.
* Each backend implements the same unified interface.
*/
import type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform, WindowManagementPlatform } from './types.js'
/**
 * Bundle of per-platform backends returned by loadPlatform().
 * Every backend file (darwin/win32/linux) exports one value of this shape.
 */
export interface Platform {
// Global mouse/keyboard input (plus optional window-bound input).
input: InputPlatform
// Screen / region / (optional) window capture.
screenshot: ScreenshotPlatform
// Display enumeration and sizing.
display: DisplayPlatform
// Running/installed app listing, launching, frontmost-app lookup.
apps: AppsPlatform
// Optional: the darwin and linux backends shown in this change do not provide it.
windowManagement?: WindowManagementPlatform
}
// Memoised backend; populated on the first successful loadPlatform() call.
let cached: Platform | undefined
/**
 * Return the backend matching `process.platform`, loading it on first use.
 *
 * The backend module is pulled in with a literal `require` inside the switch,
 * so only the module for the current OS is ever evaluated.
 *
 * @returns the cached Platform for this process
 * @throws Error when `process.platform` is not darwin, win32 or linux
 */
export function loadPlatform(): Platform {
if (cached) return cached
switch (process.platform) {
case 'darwin':
cached = require('./darwin.js').platform
break
case 'win32':
cached = require('./win32.js').platform
break
case 'linux':
cached = require('./linux.js').platform
break
default:
throw new Error(`Computer Use not supported on ${process.platform}`)
}
// Every non-throwing branch above assigned `cached`.
return cached!
}
export type { InputPlatform, ScreenshotPlatform, DisplayPlatform, AppsPlatform, WindowManagementPlatform } from './types.js'
export type { WindowHandle, ScreenshotResult, DisplayInfo, InstalledApp, FrontmostAppInfo, WindowAction } from './types.js'

View File

@@ -0,0 +1,416 @@
/**
* Linux platform backend for Computer Use.
*
* Uses:
* - xdotool for mouse/keyboard input
* - scrot for screenshots (converted to JPEG)
* - xrandr for display enumeration
* - wmctrl for window management
*
* CRITICAL: All screenshots output JPEG. scrot outputs PNG by default,
* so we pipe through ImageMagick `convert` to produce JPEG.
*/
import type { Platform } from './index.js'
import type {
InputPlatform,
ScreenshotPlatform,
DisplayPlatform,
AppsPlatform,
WindowHandle,
ScreenshotResult,
DisplayInfo,
InstalledApp,
FrontmostAppInfo,
} from './types.js'
// ---------------------------------------------------------------------------
// Shell helpers
// ---------------------------------------------------------------------------
/**
 * Run `cmd` synchronously and return its stdout, decoded and trimmed.
 * The exit code and stderr are not inspected.
 */
function run(cmd: string[]): string {
  const { stdout } = Bun.spawnSync({ cmd, stdout: 'pipe', stderr: 'pipe' })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
/**
 * Run `cmd` asynchronously and resolve with its trimmed stdout once the
 * process has exited. The exit code is not inspected.
 *
 * stderr is discarded rather than piped: nothing here ever reads it, and an
 * undrained pipe can stall a child that writes a lot of diagnostics (ffmpeg
 * logs to stderr) once the pipe buffer fills.
 */
async function runAsync(cmd: string[]): Promise<string> {
  const proc = Bun.spawn(cmd, { stdout: 'pipe', stderr: 'ignore' })
  const out = await new Response(proc.stdout).text()
  await proc.exited
  return out.trim()
}
/** True when `which <name>` exits with status 0. */
function commandExists(name: string): boolean {
  const { exitCode } = Bun.spawnSync({ cmd: ['which', name], stdout: 'pipe', stderr: 'pipe' })
  return exitCode === 0
}
// ---------------------------------------------------------------------------
// xdotool key name mapping
// ---------------------------------------------------------------------------
// Lower-cased key name -> key name passed to xdotool.
// Looked up by mapKey(); names that are absent here are passed through unchanged.
// Left/right modifier variants collapse onto the same xdotool modifier name.
const KEY_MAP: Record<string, string> = {
return: 'Return', enter: 'Return', tab: 'Tab', space: 'space',
backspace: 'BackSpace', delete: 'Delete', escape: 'Escape', esc: 'Escape',
left: 'Left', up: 'Up', right: 'Right', down: 'Down',
home: 'Home', end: 'End', pageup: 'Prior', pagedown: 'Next',
f1: 'F1', f2: 'F2', f3: 'F3', f4: 'F4', f5: 'F5', f6: 'F6',
f7: 'F7', f8: 'F8', f9: 'F9', f10: 'F10', f11: 'F11', f12: 'F12',
shift: 'shift', lshift: 'shift', rshift: 'shift',
control: 'ctrl', ctrl: 'ctrl', lcontrol: 'ctrl', rcontrol: 'ctrl',
alt: 'alt', option: 'alt', lalt: 'alt', ralt: 'alt',
win: 'super', meta: 'super', command: 'super', cmd: 'super', super: 'super',
insert: 'Insert', printscreen: 'Print', pause: 'Pause',
numlock: 'Num_Lock', capslock: 'Caps_Lock', scrolllock: 'Scroll_Lock',
}
// Lower-cased names that input.keys() treats as modifiers when assembling a
// "mod+mod+key" combo; everything else is a candidate for the final key.
const MODIFIER_KEYS = new Set([
'shift', 'lshift', 'rshift', 'control', 'ctrl', 'lcontrol', 'rcontrol',
'alt', 'option', 'lalt', 'ralt', 'win', 'meta', 'command', 'cmd', 'super',
])
/**
 * Translate a key name to the name passed to xdotool (case-insensitive
 * lookup in KEY_MAP). Names with no entry are returned exactly as given.
 */
function mapKey(name: string): string {
  const lookup = name.toLowerCase()
  const translated = KEY_MAP[lookup]
  return translated ?? name
}
/** Button number passed to `xdotool click`: left = 1, middle = 2, right = 3. */
function mouseButtonNum(button: 'left' | 'right' | 'middle'): string {
  switch (button) {
    case 'left':
      return '1'
    case 'right':
      return '3'
    default:
      return '2'
  }
}
// ---------------------------------------------------------------------------
// Input — xdotool
// ---------------------------------------------------------------------------
const input: InputPlatform = {
  /** Move the pointer to (x, y); coordinates are rounded to whole pixels. */
  async moveMouse(x, y) {
    run(['xdotool', 'mousemove', '--sync', String(Math.round(x)), String(Math.round(y))])
  },

  /** Move the pointer to (x, y), then click once with `button`. */
  async click(x, y, button) {
    run(['xdotool', 'mousemove', '--sync', String(Math.round(x)), String(Math.round(y))])
    run(['xdotool', 'click', mouseButtonNum(button)])
  },

  /**
   * Type `text` with a 12 ms inter-key delay.
   * The `--` terminator stops option parsing so text that begins with `-`
   * (e.g. "-v" or "--help") is typed literally instead of being interpreted
   * as an xdotool flag.
   */
  async typeText(text) {
    run(['xdotool', 'type', '--delay', '12', '--', text])
  },

  /** Hold ('press') or release (anything else) a single key. */
  async key(name, action) {
    const mapped = mapKey(name)
    if (action === 'press') {
      run(['xdotool', 'keydown', mapped])
    } else {
      run(['xdotool', 'keyup', mapped])
    }
  },

  /**
   * Send a key chord as one `xdotool key mod+mod+key` invocation.
   * Modifier names are collected in order; the last non-modifier entry is the
   * key. A chord made only of modifiers is a no-op.
   */
  async keys(parts) {
    const modifiers: string[] = []
    let finalKey: string | null = null
    for (const part of parts) {
      if (MODIFIER_KEYS.has(part.toLowerCase())) {
        modifiers.push(mapKey(part))
      } else {
        finalKey = part
      }
    }
    if (!finalKey) return
    const combo = [...modifiers, mapKey(finalKey)].join('+')
    run(['xdotool', 'key', combo])
  },

  /**
   * Scroll by clicking a wheel button `|round(amount)|` times.
   * Vertical: amount >= 0 uses button 5, negative uses button 4.
   * Horizontal: amount >= 0 uses button 7, negative uses button 6.
   */
  async scroll(amount, direction) {
    if (direction === 'vertical') {
      const btn = amount >= 0 ? '5' : '4'
      const repeats = Math.abs(Math.round(amount))
      if (repeats > 0) run(['xdotool', 'click', '--repeat', String(repeats), btn])
    } else {
      const btn = amount >= 0 ? '7' : '6'
      const repeats = Math.abs(Math.round(amount))
      if (repeats > 0) run(['xdotool', 'click', '--repeat', String(repeats), btn])
    }
  },

  /**
   * Current pointer position parsed from `xdotool getmouselocation`
   * ("x:<n> y:<n> ..."). A coordinate that cannot be parsed is reported as 0.
   */
  async mouseLocation() {
    const out = run(['xdotool', 'getmouselocation'])
    const xMatch = out.match(/x:(\d+)/)
    const yMatch = out.match(/y:(\d+)/)
    return {
      x: xMatch ? Number(xMatch[1]) : 0,
      y: yMatch ? Number(yMatch[1]) : 0,
    }
  },
  // No window-bound input on Linux
}
// ---------------------------------------------------------------------------
// Screenshot — scrot → JPEG conversion
// ---------------------------------------------------------------------------
const SCREENSHOT_TMP = '/tmp/cu-screenshot-tmp.png'
const SCREENSHOT_JPG = '/tmp/cu-screenshot.jpg'

/**
 * Convert the PNG at `pngPath` to JPEG and return it base64-encoded together
 * with the caller-supplied `width`/`height` (the image itself is not measured).
 *
 * Converter preference: ImageMagick `convert`, then `ffmpeg`. When neither is
 * installed the PNG bytes are returned as-is, so the payload is not JPEG.
 */
async function pngToJpegBase64(pngPath: string, width: number, height: number): Promise<ScreenshotResult> {
  // Read a file and wrap it in the result shape.
  const encode = async (path: string): Promise<ScreenshotResult> => {
    const bytes = await Bun.file(path).arrayBuffer()
    return { base64: Buffer.from(bytes).toString('base64'), width, height }
  }

  if (commandExists('convert')) {
    await runAsync(['convert', pngPath, '-quality', '75', SCREENSHOT_JPG])
    return encode(SCREENSHOT_JPG)
  }

  if (commandExists('ffmpeg')) {
    await runAsync(['ffmpeg', '-y', '-i', pngPath, '-q:v', '5', SCREENSHOT_JPG])
    return encode(SCREENSHOT_JPG)
  }

  // No converter available: hand back the PNG unchanged.
  return encode(pngPath)
}
const screenshot: ScreenshotPlatform = {
  /**
   * Capture with `scrot` and convert to JPEG.
   * Returns an empty result ({ base64: '', width: 0, height: 0 }) on failure.
   *
   * The conversion is awaited inside the try block: a bare
   * `return promise` would let a rejection (e.g. the capture file is missing)
   * escape the catch and surface as an unhandled error to the caller.
   *
   * NOTE(review): scrot is invoked without a display selector while the
   * reported size comes from display.getSize(displayId) — confirm these agree
   * on multi-monitor setups.
   */
  async captureScreen(displayId) {
    try {
      await runAsync(['scrot', '-o', SCREENSHOT_TMP])
      const size = display.getSize(displayId)
      return await pngToJpegBase64(SCREENSHOT_TMP, size.width, size.height)
    } catch {
      return { base64: '', width: 0, height: 0 }
    }
  },

  /**
   * Capture the rectangle (x, y, w, h) with `scrot -a` and convert to JPEG.
   * On failure returns an empty image that still carries the requested size.
   */
  async captureRegion(x, y, w, h) {
    try {
      await runAsync(['scrot', '-a', `${x},${y},${w},${h}`, '-o', SCREENSHOT_TMP])
      return await pngToJpegBase64(SCREENSHOT_TMP, w, h)
    } catch {
      return { base64: '', width: w, height: h }
    }
  },

  /**
   * Capture a single X window (`hwnd` is the X window id) with ImageMagick
   * `import`, writing JPEG directly. Dimensions come from
   * `xdotool getwindowgeometry --shell`; unparsable values are reported as 0.
   * Returns null when `import` is unavailable or anything throws.
   */
  async captureWindow(hwnd) {
    try {
      // Use xdotool to get window geometry, then import (ImageMagick) to capture
      if (commandExists('import')) {
        const jpgPath = '/tmp/cu-window-capture.jpg'
        await runAsync(['import', '-window', hwnd, '-quality', '75', jpgPath])
        // Get dimensions from xdotool
        const geom = run(['xdotool', 'getwindowgeometry', '--shell', hwnd])
        const wMatch = geom.match(/WIDTH=(\d+)/)
        const hMatch = geom.match(/HEIGHT=(\d+)/)
        const width = wMatch ? Number(wMatch[1]) : 0
        const height = hMatch ? Number(hMatch[1]) : 0
        const file = Bun.file(jpgPath)
        const buffer = await file.arrayBuffer()
        return { base64: Buffer.from(buffer).toString('base64'), width, height }
      }
      return null
    } catch {
      return null
    }
  },
}
// ---------------------------------------------------------------------------
// Display — xrandr
// ---------------------------------------------------------------------------
const display: DisplayPlatform = {
  /**
   * Parse connected outputs from `xrandr --query`.
   * Each match becomes one DisplayInfo with scaleFactor 1 and a displayId equal
   * to its position in the output. Falls back to a single 1920x1080 display
   * when nothing matches or the command throws.
   */
  listAll(): DisplayInfo[] {
    const fallback = (): DisplayInfo[] => [
      { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 },
    ]
    try {
      const xrandrOutput = run(['xrandr', '--query'])
      const connectedLine = /^\S+\s+connected\s+(?:primary\s+)?(\d+)x(\d+)\+\d+\+\d+/gm
      const found = Array.from(
        xrandrOutput.matchAll(connectedLine),
        (m, position): DisplayInfo => ({
          width: Number(m[1]),
          height: Number(m[2]),
          scaleFactor: 1,
          displayId: position,
        }),
      )
      return found.length > 0 ? found : fallback()
    } catch {
      return fallback()
    }
  },

  /**
   * Display with the given id; the first display when the id is omitted or
   * unknown; a 1920x1080 default when the list is empty.
   */
  getSize(displayId): DisplayInfo {
    const everything = this.listAll()
    const requested =
      displayId === undefined ? undefined : everything.find(d => d.displayId === displayId)
    return requested ?? everything[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
  },
}
// ---------------------------------------------------------------------------
// Apps — wmctrl + ps + .desktop files
// ---------------------------------------------------------------------------
const apps: AppsPlatform = {
  /**
   * List top-level windows (at most 50).
   * Preferred source is `wmctrl -l -p`; when wmctrl is missing, falls back to
   * `xdotool search --name ''`. Returns [] if anything throws.
   */
  listRunning(): WindowHandle[] {
    try {
      if (commandExists('wmctrl')) {
        const raw = run(['wmctrl', '-l', '-p'])
        const handles: WindowHandle[] = []
        for (const line of raw.split('\n').filter(Boolean)) {
          // Columns: window id, desktop, pid, hostname, title...
          const parts = line.split(/\s+/)
          const windowId = parts[0]
          const pid = Number(parts[2])
          if (!pid) continue
          // Title is everything after the 4th field (hostname)
          const title = parts.slice(4).join(' ')
          let exePath = ''
          try { exePath = run(['readlink', '-f', `/proc/${pid}/exe`]) } catch {}
          handles.push({
            id: windowId ?? '',
            pid,
            title,
            exePath: exePath || undefined,
          })
        }
        // Deduplicate by id
        const seen = new Set<string>()
        return handles.filter(h => {
          if (seen.has(h.id)) return false
          seen.add(h.id)
          return true
        }).slice(0, 50)
      }
      // Fallback: xdotool search
      const raw = run(['xdotool', 'search', '--name', ''])
      const handles: WindowHandle[] = []
      for (const windowId of raw.split('\n').filter(Boolean).slice(0, 50)) {
        const title = run(['xdotool', 'getwindowname', windowId])
        let pid = 0
        try { pid = Number(run(['xdotool', 'getwindowpid', windowId])) } catch {}
        if (title) {
          handles.push({ id: windowId, pid, title })
        }
      }
      return handles
    } catch {
      return []
    }
  },

  /**
   * Scan the standard application directories for `.desktop` files and return
   * up to 200 entries. Entries with `NoDisplay=true` or without a `Name=` are
   * skipped. `id` is the file name without `.desktop`; `path` is the first
   * word of the `Exec=` line.
   */
  async listInstalled(): Promise<InstalledApp[]> {
    try {
      const dirs = [
        '/usr/share/applications',
        '/usr/local/share/applications',
        `${process.env.HOME}/.local/share/applications`,
      ]
      const result: InstalledApp[] = []
      for (const dir of dirs) {
        let files: string
        try {
          files = run(['find', dir, '-name', '*.desktop', '-maxdepth', '1'])
        } catch { continue }
        for (const filepath of files.split('\n').filter(Boolean)) {
          try {
            const content = run(['cat', filepath])
            const nameMatch = content.match(/^Name=(.+)$/m)
            const execMatch = content.match(/^Exec=(.+)$/m)
            const noDisplay = content.match(/^NoDisplay=true$/m)
            if (noDisplay) continue
            const name = nameMatch?.[1] ?? ''
            const exec = execMatch?.[1] ?? ''
            if (!name) continue
            result.push({
              id: filepath.split('/').pop()?.replace('.desktop', '') ?? '',
              displayName: name,
              path: exec.split(/\s+/)[0] ?? '',
            })
          } catch { /* skip unreadable */ }
        }
      }
      return result.slice(0, 200)
    } catch {
      return []
    }
  },

  /**
   * Launch an application by desktop-entry name, falling back to `xdg-open`.
   *
   * gtk-launch is spawned directly (not through runAsync) so its exit status
   * can be checked: runAsync discards the status, which meant a failed
   * gtk-launch (unknown desktop entry) returned as if it had succeeded and the
   * xdg-open fallback was never tried.
   */
  async open(name) {
    try {
      const desktopName = name.endsWith('.desktop') ? name : `${name}.desktop`
      if (commandExists('gtk-launch')) {
        const launcher = Bun.spawn(['gtk-launch', desktopName], {
          stdout: 'ignore',
          stderr: 'ignore',
        })
        const status = await launcher.exited
        if (status === 0) return
      }
    } catch { /* fall through */ }
    await runAsync(['xdg-open', name])
  },

  /**
   * Identify the active window's process via xdotool and /proc.
   * `id` is the resolved executable path (or the /proc/<pid>/exe path when
   * readlink yields nothing); `appName` comes from /proc/<pid>/comm.
   * Returns null when no active window / pid is found or both lookups are empty.
   */
  getFrontmostApp(): FrontmostAppInfo | null {
    try {
      const windowId = run(['xdotool', 'getactivewindow'])
      if (!windowId) return null
      const pidStr = run(['xdotool', 'getwindowpid', windowId])
      if (!pidStr) return null
      let exePath = ''
      try { exePath = run(['readlink', '-f', `/proc/${pidStr}/exe`]) } catch {}
      let appName = ''
      try { appName = run(['cat', `/proc/${pidStr}/comm`]) } catch {}
      if (!exePath && !appName) return null
      return { id: exePath || `/proc/${pidStr}/exe`, appName: appName || 'unknown' }
    } catch {
      return null
    }
  },

  /**
   * First window reported by `xdotool search --name <title>`, or null.
   * `pid` is 0 when it cannot be determined.
   */
  findWindowByTitle(title): WindowHandle | null {
    try {
      // xdotool search by name
      const raw = run(['xdotool', 'search', '--name', title])
      const windowId = raw.split('\n')[0]
      if (!windowId) return null
      const windowTitle = run(['xdotool', 'getwindowname', windowId])
      let pid = 0
      try { pid = Number(run(['xdotool', 'getwindowpid', windowId])) } catch {}
      return { id: windowId, pid, title: windowTitle }
    } catch {
      return null
    }
  },
}
// ---------------------------------------------------------------------------
// Export
// ---------------------------------------------------------------------------
export const platform: Platform = { input, screenshot, display, apps }

View File

@@ -0,0 +1,153 @@
/**
* Cross-platform abstraction types for Computer Use.
*
* These interfaces define a unified API surface for input, screenshots,
* display info, and app management across macOS, Windows, and Linux.
*/
// ---------------------------------------------------------------------------
// Window / App types
// ---------------------------------------------------------------------------
/**
 * Cross-platform window identifier.
 * What `id` holds depends on the backend (see the field comment); treat it as
 * an opaque string outside the backend that produced it.
 */
export interface WindowHandle {
id: string // macOS: bundleId, Windows: HWND string, Linux: window ID
// Owning process id. The macOS backend always reports 0; the Linux xdotool
// paths report 0 when the pid lookup fails.
pid: number
// Window title. On macOS this is the application's display name.
title: string
exePath?: string // Windows/Linux: process executable path
}
/**
 * A captured image plus the dimensions reported for it.
 * The Linux backend returns `base64: ''` when a capture fails.
 */
export interface ScreenshotResult {
// Base64-encoded image bytes.
base64: string
// Reported width/height. In the Linux backend these come from the display
// size, the requested region, or the window geometry — they are not read back
// from the encoded image.
width: number
height: number
}
/** Geometry of one display as reported by a backend. */
export interface DisplayInfo {
width: number
height: number
// The Linux backend always reports 1.
scaleFactor: number
// Backend-assigned identifier. On Linux it is the index of the output in
// xrandr's listing.
displayId: number
}
/** An application that can be launched, as discovered by AppsPlatform.listInstalled(). */
export interface InstalledApp {
id: string // macOS: bundleId, Windows: exe path or package family, Linux: .desktop name
// Human-readable name (Linux: the `Name=` entry of the .desktop file).
displayName: string
// Location of the app. On Linux this is the first word of the `Exec=` line.
path: string
}
/** Identity of the application that currently owns the foreground window. */
export interface FrontmostAppInfo {
// macOS: bundle id. Linux: resolved executable path (or /proc/<pid>/exe).
id: string
// macOS: app name from the input module. Linux: /proc/<pid>/comm, or 'unknown'.
appName: string
}
// ---------------------------------------------------------------------------
// InputPlatform
// ---------------------------------------------------------------------------
/**
* Input platform interface — two modes:
*
* Mode A (Global): moveMouse, click, typeText, key, keys, scroll, mouseLocation
* Works on all platforms. Sends input to the foreground window; moves the
* real cursor and steals focus.
*
* Mode B (Window-bound, optional): sendChar, sendKey, sendClick, sendText
* Windows-only via SendMessage/PostMessage. Does NOT steal focus or move
* the cursor. Preferred when a target HWND is known.
*/
export interface InputPlatform {
// --- Mode A: Global input (all platforms) ---
// Move the pointer to screen coordinates (x, y).
moveMouse(x: number, y: number): Promise<void>
// Move to (x, y) and click once with the given button.
click(
x: number,
y: number,
button: 'left' | 'right' | 'middle',
): Promise<void>
// Type a string of text.
typeText(text: string): Promise<void>
// Press (hold) or release a single named key.
key(name: string, action: 'press' | 'release'): Promise<void>
// Send a key combination, e.g. modifiers followed by a key.
keys(combo: string[]): Promise<void>
// Scroll by `amount` along `direction`. The Linux backend treats
// amount >= 0 as wheel button 5 (vertical) / 7 (horizontal), negative as 4 / 6.
scroll(amount: number, direction: 'vertical' | 'horizontal'): Promise<void>
// Current pointer position.
mouseLocation(): Promise<{ x: number; y: number }>
// --- Mode B: Window-bound input (Windows only, optional) ---
// The darwin and linux backends do not define any of the following.
sendChar?(hwnd: string, char: string): Promise<void>
sendKey?(hwnd: string, vk: number, action: 'down' | 'up'): Promise<void>
sendClick?(
hwnd: string,
x: number,
y: number,
button: 'left' | 'right',
): Promise<void>
sendText?(hwnd: string, text: string): Promise<void>
}
// ---------------------------------------------------------------------------
// ScreenshotPlatform
// ---------------------------------------------------------------------------
/**
 * Screenshot capture backend.
 *
 * NOTE: the Linux backend returns the raw PNG bytes when neither ImageMagick
 * `convert` nor `ffmpeg` is installed, so "JPEG" below is the intended format,
 * not a guarantee on every host.
 */
export interface ScreenshotPlatform {
/** Full-screen capture. Returns JPEG base64. */
captureScreen(displayId?: number): Promise<ScreenshotResult>
/** Region capture. Returns JPEG base64. */
captureRegion(
x: number,
y: number,
w: number,
h: number,
): Promise<ScreenshotResult>
/** Window capture (Windows: PrintWindow, macOS: SCContentFilter, Linux: xdotool+import). */
// Optional: the darwin backend does not implement it. `hwnd` is the
// platform's window identifier (an X window id on Linux). The Linux backend
// resolves to null when capture is unavailable or fails.
captureWindow?(hwnd: string): Promise<ScreenshotResult | null>
}
// ---------------------------------------------------------------------------
// DisplayPlatform
// ---------------------------------------------------------------------------
/** Display enumeration backend (synchronous). */
export interface DisplayPlatform {
// All displays known to the backend.
listAll(): DisplayInfo[]
// Info for one display. The Linux backend falls back to the first display
// when `displayId` is omitted or not found.
getSize(displayId?: number): DisplayInfo
}
// ---------------------------------------------------------------------------
// AppsPlatform
// ---------------------------------------------------------------------------
/** Application / window discovery and launching backend. */
export interface AppsPlatform {
// Currently running apps or windows, depending on the backend
// (macOS: one entry per app; Linux: one entry per window).
listRunning(): WindowHandle[]
// Applications available to launch.
listInstalled(): Promise<InstalledApp[]>
// Launch an application by name.
open(name: string): Promise<void>
// Foreground application, or null when it cannot be determined.
getFrontmostApp(): FrontmostAppInfo | null
// First handle matching `title`, or null. Matching rules are backend-specific
// (macOS: substring of the app display name; Linux: `xdotool search --name`).
findWindowByTitle(title: string): WindowHandle | null
}
// ---------------------------------------------------------------------------
// WindowManagementPlatform (Windows HWND-targeted, no global APIs)
// ---------------------------------------------------------------------------
// Actions accepted by WindowManagementPlatform.manageWindow().
// NOTE(review): the semantics of each action live in the win32 backend, which
// is not visible here — confirm there before relying on a specific behavior.
export type WindowAction =
| 'minimize'
| 'maximize'
| 'restore'
| 'close'
| 'focus'
| 'move_offscreen'
| 'move_resize'
| 'get_rect'
/**
 * Window management for the currently bound window (Windows HWND-targeted).
 * Optional on Platform: the darwin and linux backends do not provide it.
 */
export interface WindowManagementPlatform {
/** Perform a window management action on the bound HWND. All via Win32 API, no global shortcuts. */
// `opts` carries geometry for actions that need it.
// Returns a boolean — presumably success; confirm against the win32 backend.
manageWindow(
action: WindowAction,
opts?: { x?: number; y?: number; width?: number; height?: number },
): boolean
/** Move window to specific position and/or resize */
moveResize(x: number, y: number, width?: number, height?: number): boolean
/** Get current window rect */
// null when no rect is available.
getWindowRect(): {
x: number
y: number
width: number
height: number
} | null
}

View File

@@ -0,0 +1,979 @@
/**
* Windows platform backend for Computer Use.
*
* Combines:
* - PowerShell SetCursorPos/SendInput for global input (fallback)
* - win32/windowMessage.ts for window-bound SendMessage input (preferred)
* - Python Bridge (bridge.py) for screenshots (mss + ctypes PrintWindow)
* - win32/windowEnum.ts for EnumWindows app listing
* - No PowerShell for screenshots (Python Bridge only, no PS fallback)
* - PowerShell Screen.AllScreens for display enumeration
*
* CRITICAL: All screenshots output JPEG (ImageFormat::Jpeg), not PNG.
*/
import type { Platform } from './index.js'
import type {
InputPlatform,
ScreenshotPlatform,
DisplayPlatform,
AppsPlatform,
WindowHandle,
ScreenshotResult,
DisplayInfo,
InstalledApp,
FrontmostAppInfo,
} from './types.js'
import { listWindows } from '../win32/windowEnum.js'
import { detectAppType, openWithController } from '../win32/appDispatcher.js'
import {
markBound,
unmarkBound,
cleanupAllBorders,
} from '../win32/windowBorder.js'
import {
showVirtualCursor,
hideVirtualCursor,
moveVirtualCursor,
} from '../win32/virtualCursor.js'
import { showIndicator, hideIndicator } from '../win32/inputIndicator.js'
import {
ps,
psAsync,
validateHwnd,
VK_MAP,
MODIFIER_KEYS,
} from '../win32/shared.js'
import { logForDebugging } from '../../debug.js'
// ---------------------------------------------------------------------------
// Python Bridge (lazy-loaded, preferred over PowerShell for screenshots)
// ---------------------------------------------------------------------------
let _bridge: typeof import('../win32/bridgeClient.js') | undefined

/**
 * Lazily load the Python bridge client module.
 * Returns the cached module once loaded; returns undefined when the module
 * cannot be required (the load is attempted again on the next call).
 */
function getBridge() {
  if (_bridge) return _bridge
  try {
    _bridge =
      require('../win32/bridgeClient.js') as typeof import('../win32/bridgeClient.js')
  } catch {
    // Loading failed: leave `_bridge` unset.
  }
  return _bridge
}
/**
 * Invoke `method` on the Python bridge synchronously.
 * Resolves to null when the bridge module is unavailable or the call throws,
 * leaving it to the caller to degrade.
 */
function bridgeCallSync<T>(
  method: string,
  params: Record<string, unknown> = {},
): T | null {
  try {
    const client = getBridge()
    return client ? client.callSync<T>(method, params) : null
  } catch {
    return null
  }
}
// validateHwnd, ps, psAsync, VK_MAP, MODIFIER_KEYS imported from '../win32/shared.js'
// ---------------------------------------------------------------------------
// Win32 P/Invoke types (compiled once per PS session)
// ---------------------------------------------------------------------------
/**
 * PowerShell prelude that compiles the `CuWin32` P/Invoke helper class.
 *
 * Layout note for INPUT / KINPUT: the native INPUT struct is
 * `{ DWORD type; union { MOUSEINPUT; KEYBDINPUT; HARDWAREINPUT } }`. The union
 * is pointer-aligned, so it starts at offset 4 on 32-bit but offset 8 on
 * 64-bit Windows. A hard-coded `FieldOffset(4)` therefore shifts every field by
 * four bytes on x64, and a keyboard-only struct ends up smaller than
 * sizeof(INPUT), which makes SendInput reject the `cbSize` argument.
 * Both wrappers use sequential layout instead (the marshaller picks the right
 * offset for the process bitness) and KINPUT is padded to the size of INPUT.
 * Type and field names (`INPUT.type/mi`, `KINPUT.type/ki`) are unchanged.
 *
 * The here-string terminator `'@` must stay at the start of its line.
 */
const WIN32_TYPES = `
Add-Type -Language CSharp @'
using System;
using System.Runtime.InteropServices;
using System.Text;
using System.Diagnostics;
public class CuWin32 {
    // --- Cursor ---
    [DllImport("user32.dll")] public static extern bool SetCursorPos(int X, int Y);
    [DllImport("user32.dll")] public static extern bool GetCursorPos(out POINT p);
    [StructLayout(LayoutKind.Sequential)] public struct POINT { public int X; public int Y; }
    // --- SendInput ---
    [StructLayout(LayoutKind.Sequential)] public struct MOUSEINPUT {
        public int dx; public int dy; public int mouseData; public uint dwFlags; public uint time; public IntPtr dwExtraInfo;
    }
    // Sequential layout: mi lands at offset 4 (x86) or 8 (x64), matching the native union.
    [StructLayout(LayoutKind.Sequential)] public struct INPUT {
        public uint type;
        public MOUSEINPUT mi;
    }
    [StructLayout(LayoutKind.Sequential)] public struct KEYBDINPUT {
        public ushort wVk; public ushort wScan; public uint dwFlags; public uint time; public IntPtr dwExtraInfo;
    }
    // KEYBDINPUT is 8 bytes smaller than MOUSEINPUT; pad so sizeof(KINPUT) == sizeof(INPUT).
    [StructLayout(LayoutKind.Sequential)] public struct KINPUT {
        public uint type;
        public KEYBDINPUT ki;
        public uint pad0;
        public uint pad1;
    }
    [DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, INPUT[] i, int cb);
    [DllImport("user32.dll", SetLastError=true)] public static extern uint SendInput(uint n, KINPUT[] i, int cb);
    // --- Keyboard ---
    [DllImport("user32.dll")] public static extern void keybd_event(byte bVk, byte bScan, uint dwFlags, UIntPtr dwExtraInfo);
    [DllImport("user32.dll")] public static extern short VkKeyScan(char ch);
    // --- Window ---
    [DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow();
    [DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint pid);
    [DllImport("user32.dll", CharSet=CharSet.Unicode)] public static extern int GetWindowText(IntPtr hWnd, StringBuilder sb, int max);
    // Constants
    public const uint INPUT_MOUSE = 0, INPUT_KEYBOARD = 1;
    public const uint MOUSEEVENTF_LEFTDOWN = 0x0002, MOUSEEVENTF_LEFTUP = 0x0004;
    public const uint MOUSEEVENTF_RIGHTDOWN = 0x0008, MOUSEEVENTF_RIGHTUP = 0x0010;
    public const uint MOUSEEVENTF_MIDDLEDOWN = 0x0020, MOUSEEVENTF_MIDDLEUP = 0x0040;
    public const uint MOUSEEVENTF_WHEEL = 0x0800, MOUSEEVENTF_HWHEEL = 0x1000;
    public const uint KEYEVENTF_KEYUP = 0x0002;
}
'@
`
// VK_MAP and MODIFIER_KEYS imported from '../win32/shared.js'
// ---------------------------------------------------------------------------
// Session-level HWND binding — all operations target this handle
// ---------------------------------------------------------------------------
// Handle of the window the session is bound to (kept as a string); null when no window is bound.
let boundHwnd: string | null = null
// Process id supplied to bindWindow() for the bound window; null when omitted or when nothing is bound.
let boundPid: number | null = null
// Controller type of the current binding: 'generic' after bindWindow(), the caller-supplied type after bindFile().
let boundAppType: import('../win32/appDispatcher.js').AppType | null = null
// Path of the file bound through bindFile(); null for window bindings and when unbound.
let boundFilePath: string | null = null
/**
 * Report which window the session is currently bound to.
 *
 * @returns the bound HWND as a string, or null while no window is bound
 */
export function getBoundHwnd(): string | null {
  const current = boundHwnd
  return current
}
/**
 * Report the controller type of the current binding.
 *
 * @returns the bound app type, or null while nothing is bound
 */
export function getBoundAppType(): string | null {
  const current = boundAppType
  return current
}
/**
 * Bind the session to a window; every later input / screenshot operation
 * targets this handle.
 *
 * Overlays belonging to a previous binding are removed first. The target is
 * then briefly brought to the foreground (restored first when minimized) and
 * the previously focused window is re-activated, after which the border,
 * virtual cursor and indicator overlays are shown for the new window.
 *
 * @param hwnd - window handle; passed through validateHwnd() before use
 * @param pid  - optional process id remembered alongside the handle
 */
export function bindWindow(hwnd: string, pid?: number): void {
  const target = validateHwnd(hwnd)

  // Tear down the visual markers of whatever was bound before.
  const previous = boundHwnd
  if (previous) {
    unmarkBound(previous)
    hideVirtualCursor()
    hideIndicator()
  }

  boundHwnd = target
  boundPid = pid ?? null
  boundAppType = 'generic'
  boundFilePath = null

  // Step 1 — brief activation. Some apps (UWP/Electron) do not process
  // SendMessage while they have never been activated, so: remember the
  // current foreground window, activate the target, then hand focus back.
  ps(`
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuActivate {
[DllImport("user32.dll")] public static extern IntPtr GetForegroundWindow();
[DllImport("user32.dll")] public static extern bool SetForegroundWindow(IntPtr h);
[DllImport("user32.dll")] public static extern bool IsIconic(IntPtr h);
[DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr h, int cmd);
}
'@
$prev = [CuActivate]::GetForegroundWindow()
$target = [IntPtr]::new([long]${target})
if ([CuActivate]::IsIconic($target)) { [CuActivate]::ShowWindow($target, 9) | Out-Null }
[CuActivate]::SetForegroundWindow($target) | Out-Null
Start-Sleep -Milliseconds 100
if ($prev -ne [IntPtr]::Zero -and $prev -ne $target) {
[CuActivate]::SetForegroundWindow($prev) | Out-Null
}
`)

  // Step 2 — visual indicators for the newly bound window.
  markBound(target)
  showVirtualCursor(target)
  showIndicator(target)
}
/**
 * Bind the session to a COM-controlled file (Excel/Word — no window needed).
 *
 * Any existing window binding is released first, including its border,
 * virtual cursor and indicator overlays; otherwise those overlays would stay
 * on screen with no bound HWND left to remove them through.
 *
 * @param filePath - path of the document that subsequent COM operations target
 * @param appType  - controller type to route operations to
 */
export function bindFile(
  filePath: string,
  appType: import('../win32/appDispatcher.js').AppType,
): void {
  // Same overlay teardown bindWindow() performs when it replaces a binding.
  if (boundHwnd) {
    unmarkBound(boundHwnd)
    hideVirtualCursor()
    hideIndicator()
  }
  boundHwnd = null
  boundPid = null
  boundAppType = appType
  boundFilePath = filePath
}
/**
 * Drop the current binding and return to global mode.
 *
 * Removes the border of the bound window (when there is one), hides the
 * virtual cursor and the indicator, clears the cached edit-child / InputSite
 * mappings held by the window-message module, and resets all binding state.
 */
export function unbindWindow(): void {
  const previous = boundHwnd
  if (previous) {
    unmarkBound(previous)
  }
  hideVirtualCursor()
  hideIndicator()

  // Cached edit-child / InputSite lookups belong to the old binding.
  getWm().clearEditChildCache()

  boundHwnd = null
  boundPid = null
  boundAppType = null
  boundFilePath = null
}
// ---------------------------------------------------------------------------
// Window Message module (lazy loaded)
// ---------------------------------------------------------------------------
// Memoized '../win32/windowMessage.js' module; undefined until first use.
let _wm: typeof import('../win32/windowMessage.js') | undefined

/**
 * Return the window-message module, requiring it on the first call and
 * reusing the cached instance afterwards.
 */
function getWm(): typeof import('../win32/windowMessage.js') {
  if (_wm === undefined || _wm === null) {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    _wm = require('../win32/windowMessage.js') as typeof import('../win32/windowMessage.js')
  }
  return _wm
}
// ---------------------------------------------------------------------------
// Input — when an HWND is bound, every operation is delivered to that window
// through SendMessage. No global API (SetCursorPos / SendInput / keybd_event /
// SendKeys) is used in that mode, so the user's desktop is never disturbed.
// Without a binding, moveMouse / click / scroll fall back to the global
// SetCursorPos / SendInput path, while typeText / key / keys throw.
// ---------------------------------------------------------------------------
/**
 * Windows implementation of InputPlatform.
 *
 * Bound mode (a window HWND is bound): mouse, keyboard and scroll operations
 * are delivered to that window through the window-message module or
 * SendMessage, and only the virtual cursor overlay moves.
 * Unbound mode: moveMouse / click / scroll use the global SetCursorPos /
 * SendInput path; typeText / key / keys throw because they need a binding.
 */
const input: InputPlatform = {
  /** Move the pointer: virtual cursor only when bound, real cursor otherwise. */
  async moveMouse(x, y) {
    const px = Math.round(x)
    const py = Math.round(y)
    if (boundHwnd) {
      // Bound mode: move virtual cursor (visual only), no real cursor movement
      moveVirtualCursor(px, py)
      return
    }
    ps(`${WIN32_TYPES}; [CuWin32]::SetCursorPos(${px}, ${py}) | Out-Null`)
  },

  /** Click at (x, y) with the given button. */
  async click(x, y, button) {
    const px = Math.round(x)
    const py = Math.round(y)
    if (boundHwnd) {
      moveVirtualCursor(px, py, true)
      // Prefer the edit child reported by findEditChild(); the lookup is by
      // window only (it does not take the click coordinates). Fall back to
      // the bound top-level window when there is none or the click fails.
      const editHwnd = getWm().findEditChild(boundHwnd)
      const targetHwnd = editHwnd ?? boundHwnd
      const ok = getWm().sendClick(targetHwnd, px, py, button)
      if (!ok) {
        getWm().sendClick(boundHwnd, px, py, button)
      }
      return
    }
    const downFlag =
      button === 'left'
        ? 'MOUSEEVENTF_LEFTDOWN'
        : button === 'right'
          ? 'MOUSEEVENTF_RIGHTDOWN'
          : 'MOUSEEVENTF_MIDDLEDOWN'
    const upFlag =
      button === 'left'
        ? 'MOUSEEVENTF_LEFTUP'
        : button === 'right'
          ? 'MOUSEEVENTF_RIGHTUP'
          : 'MOUSEEVENTF_MIDDLEUP'
    ps(
      `${WIN32_TYPES}; [CuWin32]::SetCursorPos(${px}, ${py}) | Out-Null; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; $i.mi.dwFlags=[CuWin32]::${downFlag}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null; $i.mi.dwFlags=[CuWin32]::${upFlag}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
    )
  },

  /**
   * Type text into the bound target.
   * @throws when nothing is bound, or when sendText reports failure
   */
  async typeText(text) {
    // COM-controlled apps: write directly via COM API
    if (boundAppType === 'word' && boundFilePath) {
      // eslint-disable-next-line @typescript-eslint/no-require-imports
      const { appendText } =
        require('../win32/comWord.js') as typeof import('../win32/comWord.js')
      appendText(boundFilePath, text)
      return
    }
    // HWND-bound apps: SendMessageW(WM_CHAR) or clipboard paste
    if (boundHwnd) {
      const ok = getWm().sendText(boundHwnd, text)
      if (!ok) {
        throw new Error(
          `typeText failed: SendMessage to HWND ${boundHwnd} returned false. ` +
            `The edit control may not have been found (findEditChild returned null).`,
        )
      }
      return
    }
    throw new Error(
      'typeText requires a bound window or file. Call open() first.',
    )
  },

  /**
   * Press or release a single key on the bound window. Names not present in
   * VK_MAP and longer than one character are ignored.
   * @throws when no window is bound
   */
  async key(name, action) {
    if (boundHwnd) {
      const lower = name.toLowerCase()
      // Letter virtual-key codes equal the UPPERCASE ASCII codes. The raw code
      // of a lowercase letter is a different key ('a' = 0x61 = VK_NUMPAD1),
      // so upper-case letters before taking the char code.
      const fallbackChar = /^[a-z]$/.test(name) ? name.toUpperCase() : name
      const vk =
        VK_MAP[lower] ?? (name.length === 1 ? fallbackChar.charCodeAt(0) : 0)
      if (vk)
        getWm().sendKey(boundHwnd, vk, action === 'release' ? 'up' : 'down')
      return
    }
    throw new Error('key requires a bound window HWND. Call open() first.')
  },

  /**
   * Send a key combination to the bound window.
   * @throws when no window is bound, or when sendKeys reports failure
   */
  async keys(parts) {
    if (boundHwnd) {
      const ok = getWm().sendKeys(boundHwnd, parts)
      if (!ok) {
        throw new Error(`keys [${parts.join('+')}] failed on HWND ${boundHwnd}`)
      }
      return
    }
    throw new Error('keys requires a bound window HWND. Call open() first.')
  },

  /**
   * Scroll by `amount` steps. Positive amounts scroll down (vertical) or
   * right (horizontal) in both bound and unbound mode.
   */
  async scroll(amount, direction) {
    if (boundHwnd) {
      // WM_VSCROLL / WM_HSCROLL for window-bound scrolling
      const msg = direction === 'vertical' ? '0x0115' : '0x0114' // WM_VSCROLL / WM_HSCROLL
      const wParam = amount > 0 ? '1' : '0' // SB_LINEDOWN=1 (positive=down) / SB_LINEUP=0 (negative=up)
      const n = Math.abs(Math.round(amount))
      let script = `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WScroll {
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMessage(IntPtr h, uint m, IntPtr w, IntPtr l);
}
'@
`
      // One line-scroll message per step.
      for (let i = 0; i < n; i++) {
        script += `[WScroll]::SendMessage([IntPtr]::new([long]${boundHwnd}), ${msg}, [IntPtr]${wParam}, [IntPtr]::Zero) | Out-Null; `
      }
      ps(script)
      return
    }
    const flag =
      direction === 'vertical' ? 'MOUSEEVENTF_WHEEL' : 'MOUSEEVENTF_HWHEEL'
    // A positive MOUSEEVENTF_WHEEL delta rotates the wheel away from the user
    // (scrolls UP), whereas a positive MOUSEEVENTF_HWHEEL delta scrolls right.
    // Negate the vertical delta so that positive = down, matching bound mode.
    // Round so the script always receives an integer wheel delta.
    const steps = Math.round(amount)
    const wheelDelta = (direction === 'vertical' ? -steps : steps) * 120
    ps(
      `${WIN32_TYPES}; $i = New-Object CuWin32+INPUT; $i.type=[CuWin32]::INPUT_MOUSE; $i.mi.dwFlags=[CuWin32]::${flag}; $i.mi.mouseData=${wheelDelta}; [CuWin32]::SendInput(1, @($i), [Runtime.InteropServices.Marshal]::SizeOf($i)) | Out-Null`,
    )
  },

  /** Read the real cursor position (informational; never moves the cursor). */
  async mouseLocation() {
    const out = ps(
      `${WIN32_TYPES}; $p = New-Object CuWin32+POINT; [CuWin32]::GetCursorPos([ref]$p) | Out-Null; "$($p.X),$($p.Y)"`,
    )
    const [xStr, yStr] = out.split(',')
    return { x: Number(xStr), y: Number(yStr) }
  },

  // Explicit-HWND pass-throughs to the window-message module. The boolean
  // results of the underlying calls are not surfaced to the caller.
  async sendChar(hwnd, char) {
    getWm().sendChar(String(hwnd), char)
  },
  async sendKey(hwnd, vk, action) {
    getWm().sendKey(String(hwnd), vk, action)
  },
  async sendClick(hwnd, x, y, button) {
    getWm().sendClick(String(hwnd), x, y, button)
  },
  async sendText(hwnd, text) {
    getWm().sendText(String(hwnd), text)
  },
}
// ---------------------------------------------------------------------------
// Screenshot — JPEG output only
// ---------------------------------------------------------------------------
/**
 * Windows implementation of ScreenshotPlatform, backed by the Python bridge.
 * While a window is bound, screen and region captures return a capture of
 * that window instead.
 */
const screenshot: ScreenshotPlatform = {
  /**
   * Capture the bound window when there is one, otherwise the display with
   * the given id (0 when omitted).
   * @throws when the bridge returns no image data
   */
  async captureScreen(displayId) {
    if (boundHwnd) {
      const windowShot = this.captureWindow?.(String(boundHwnd))
      if (windowShot) return windowShot
    }

    // Python Bridge (mss + Pillow, ~300ms)
    const shot = bridgeCallSync<ScreenshotResult>('screenshot', {
      display_id: displayId ?? 0,
    })
    if (!shot || !shot.base64) {
      throw new Error(
        '[computer-use] Screenshot failed: Python bridge returned no data. ' +
          'Ensure python3 + mss + Pillow are installed (pip install mss Pillow).',
      )
    }
    return shot
  },

  /**
   * Region capture. The x / y / w / h arguments are currently not applied:
   * a bound window is captured whole (the window IS the region, matching
   * macOS behavior); otherwise this delegates to captureScreen().
   */
  async captureRegion(x, y, w, h) {
    if (!boundHwnd) {
      return this.captureScreen()
    }
    const windowShot = this.captureWindow?.(String(boundHwnd))
    return windowShot ? windowShot : this.captureScreen()
  },

  /**
   * Capture a single window through the bridge's `screenshot_window` call
   * (ctypes PrintWindow + GDI → Pillow JPEG, ~300ms).
   * @throws when the bridge returns no image data
   */
  captureWindow(hwnd) {
    const shot = bridgeCallSync<ScreenshotResult>('screenshot_window', {
      hwnd: String(hwnd),
    })
    if (!shot || !shot.base64) {
      throw new Error(
        `[computer-use] Window screenshot failed for HWND ${hwnd}: Python bridge returned no data.`,
      )
    }
    return shot
  },
}
// ---------------------------------------------------------------------------
// Display — Screen.AllScreens
// ---------------------------------------------------------------------------
/**
 * Windows implementation of DisplayPlatform, based on
 * System.Windows.Forms.Screen.AllScreens. scaleFactor is always reported as 1.
 */
const display: DisplayPlatform = {
  /**
   * List all displays. Display ids are the zero-based index in AllScreens.
   *
   * Falls back to a single 1920x1080 display when the query throws, produces
   * no output, or produces only unparsable entries, so callers always get at
   * least one usable display.
   */
  listAll(): DisplayInfo[] {
    const fallback: DisplayInfo[] = [
      { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 },
    ]
    try {
      const raw = ps(`
Add-Type -AssemblyName System.Windows.Forms
$result = @()
$idx = 0
foreach ($s in [System.Windows.Forms.Screen]::AllScreens) {
$result += "$($s.Bounds.Width),$($s.Bounds.Height),$idx,$($s.Primary)"
$idx++
}
$result -join "|"
`)
      const displays = raw
        .split('|')
        .filter(Boolean)
        .map(entry => {
          // Entry format: width,height,index,isPrimary (isPrimary is unused).
          const [w, h, id] = entry.split(',')
          return {
            width: Number(w),
            height: Number(h),
            scaleFactor: 1,
            displayId: Number(id),
          }
        })
        // Drop entries that did not parse into real numbers.
        .filter(
          d =>
            Number.isFinite(d.width) &&
            Number.isFinite(d.height) &&
            Number.isFinite(d.displayId),
        )
      // Empty output must not surface as "no displays".
      return displays.length > 0 ? displays : fallback
    } catch {
      return fallback
    }
  },

  /**
   * Return the display with the given id, or the first display when the id
   * is omitted or unknown.
   */
  getSize(displayId): DisplayInfo {
    const all = this.listAll()
    if (displayId !== undefined) {
      const found = all.find(d => d.displayId === displayId)
      if (found) return found
    }
    return all[0] ?? { width: 1920, height: 1080, scaleFactor: 1, displayId: 0 }
  },
}
// ---------------------------------------------------------------------------
// Find existing window by process name or title (avoid launching new instance)
// ---------------------------------------------------------------------------
/**
 * Look for an already-open window whose title contains `hint`
 * (case-insensitive), so open() can bind to it instead of launching a new
 * instance. Only window titles are compared.
 *
 * @param hint - text to look for in window titles
 * @returns handle and pid of the first matching window, or null when the
 *          hint is blank or nothing matches
 */
function findExistingWindow(
  hint: string,
): { hwnd: string; pid: number } | null {
  // Every title "includes" the empty string, so a blank hint would bind to
  // whichever window happens to be listed first.
  if (hint.trim() === '') return null

  const lower = hint.toLowerCase()
  const match = listWindows().find(w =>
    (w.title ?? '').toLowerCase().includes(lower),
  )
  return match ? { hwnd: match.hwnd, pid: match.pid } : null
}
// ---------------------------------------------------------------------------
// Apps — EnumWindows + registry + AppxPackage
// ---------------------------------------------------------------------------
/**
 * Windows implementation of AppsPlatform: window enumeration, installed-app
 * discovery (registry uninstall keys + Store packages), app launch with
 * HWND binding, and foreground-app lookup.
 */
const apps: AppsPlatform = {
  /** List the windows reported by listWindows() as WindowHandle records. */
  listRunning(): WindowHandle[] {
    const windows = listWindows()
    return windows.map(w => ({
      id: String(w.hwnd),
      pid: w.pid,
      title: w.title,
    }))
  },

  /**
   * List installed applications (at most 300 entries). Each script output
   * line has the form `name|installLocation|id`. Returns [] on any failure.
   */
  async listInstalled(): Promise<InstalledApp[]> {
    try {
      const raw = await psAsync(`
$apps = @()
# Traditional Win32 apps from registry
$paths = @(
'HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
'HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*',
'HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*'
)
foreach ($p in $paths) {
Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object { $_.DisplayName } | ForEach-Object {
$apps += "$($_.DisplayName)|$($_.InstallLocation)|$($_.PSChildName)"
}
}
# UWP/MSIX apps (Windows 10/11 Store apps)
Get-AppxPackage -ErrorAction SilentlyContinue | Where-Object { $_.IsFramework -eq $false -and $_.SignatureKind -eq 'Store' } | ForEach-Object {
$cleanName = $_.Name -replace '^Microsoft\\.Windows', '' -replace '^Microsoft\\.', ''
$apps += "$cleanName|$($_.InstallLocation)|$($_.PackageFamilyName)"
}
$apps | Select-Object -Unique | Select-Object -First 300
`)
      return raw
        .split('\n')
        .filter(Boolean)
        .map(line => {
          const [name, path, id] = line.trim().split('|', 3)
          return {
            id: (id ?? name ?? '').trim(),
            displayName: (name ?? '').trim(),
            path: (path ?? '').trim(),
          }
        })
    } catch {
      return []
    }
  },

  /**
   * Open (or attach to) an application and bind the session to it.
   *
   * - Excel/Word are routed to COM automation and bound by file path.
   * - Otherwise an existing window whose title contains the app name is
   *   reused; failing that the app is launched, its new top-level window is
   *   located, moved offscreen and bound.
   *
   * @throws when the launcher produces no output, the process does not
   *         start, or no window for the launched process can be found
   */
  async open(name) {
    // Detect app type and route to appropriate controller
    const appType = detectAppType(name)
    // Excel/Word → COM automation (no window, no HWND)
    if (appType === 'excel' || appType === 'word') {
      const result = await openWithController(name)
      if (result.filePath) {
        bindFile(result.filePath, result.type)
      }
      return
    }

    // Text/Browser/Generic → exe launch + HWND bind (offscreen)
    // If name is a UWP PackageFamilyName (e.g. Microsoft.WindowsNotepad_8wekyb3d8bbwe),
    // extract the app name and try as exe. This avoids launching through UWP shell.
    let launchName = name
    // File paths and executable names can also contain both '_' and '.'
    // (C:\my_docs\app.exe, my_tool.exe); they must be launched verbatim
    // rather than being cut at the first underscore.
    const looksLikePathOrExe =
      /[\\/:]/.test(name) || /\.(exe|com|bat|cmd|lnk)$/i.test(name)
    if (!looksLikePathOrExe && name.includes('_') && name.includes('.')) {
      // Microsoft.WindowsNotepad_xxx → Notepad
      // Microsoft.WindowsCalculator_xxx → Calculator
      // Microsoft.WindowsTerminal_xxx → Terminal
      const nameSegments = name.split('_')[0]?.split('.') ?? []
      const appPart = nameSegments[nameSegments.length - 1] ?? name
      // Strip "Windows" prefix: WindowsNotepad → Notepad
      launchName = appPart.replace(/^Windows/, '') || appPart
    }

    // --- Try to find an EXISTING window first (matched by window title) ---
    // If found, auto-bind to it. Use bind_window tool to switch later.
    const existingHwnd = findExistingWindow(launchName)
    if (existingHwnd) {
      bindWindow(existingHwnd.hwnd, existingHwnd.pid)
      return
    }

    // Single quotes are doubled because the name is embedded in a
    // single-quoted PowerShell string.
    const escaped = launchName.replace(/'/g, "''")
    const result = await psAsync(`
${WIN32_TYPES}
Add-Type @'
using System;
using System.Runtime.InteropServices;
using System.Text;
public class CuLaunch {
public delegate bool EnumProc(IntPtr h, IntPtr lp);
[DllImport("user32.dll")] public static extern bool EnumWindows(EnumProc cb, IntPtr lp);
[DllImport("user32.dll")] public static extern bool IsWindowVisible(IntPtr h);
[DllImport("user32.dll")] public static extern uint GetWindowThreadProcessId(IntPtr h, out uint pid);
[DllImport("user32.dll", CharSet=CharSet.Unicode)] public static extern int GetWindowText(IntPtr h, StringBuilder sb, int n);
[DllImport("user32.dll")] public static extern bool ShowWindow(IntPtr h, int cmd);
public const int SW_SHOWMINNOACTIVE = 7;
// Get all visible window HWNDs as array
public static long[] GetAllVisibleHwnds() {
var list = new System.Collections.Generic.List<long>();
EnumWindows((h, _) => {
if (IsWindowVisible(h)) list.Add(h.ToInt64());
return true;
}, IntPtr.Zero);
return list.ToArray();
}
// Get PID for a single HWND
public static uint GetPidForHwnd(long hwnd) {
uint pid; GetWindowThreadProcessId((IntPtr)hwnd, out pid);
return pid;
}
// Get title for a single HWND
public static string GetTitle(long hwnd) {
var sb = new StringBuilder(256);
GetWindowText((IntPtr)hwnd, sb, 256);
return sb.ToString();
}
}
'@
# Launch strategy: all exe-based, no GUI dialogs.
# 1) exact path 2) exe in PATH 3) registry install dir 4) raw name
$target = '${escaped}'
$proc = $null
# Snapshot ALL visible window HWNDs BEFORE launching, so that a window which
# appears immediately after Start-Process is still detected as new.
$beforeHwnds = [CuLaunch]::GetAllVisibleHwnds()
# 1. Exact file path
if (Test-Path $target) {
$proc = Start-Process $target -PassThru -ErrorAction SilentlyContinue
}
# 2. exe name in PATH (notepad.exe, code.exe, chrome.exe, etc.)
if (-not $proc) {
# Try with .exe suffix if not already
$tryExe = if ($target -notmatch '[.]exe$') { "$target.exe" } else { $target }
$found = Get-Command $tryExe -ErrorAction SilentlyContinue | Select-Object -First 1
if (-not $found) { $found = Get-Command $target -ErrorAction SilentlyContinue | Select-Object -First 1 }
if ($found) { $proc = Start-Process $found.Source -PassThru -ErrorAction SilentlyContinue }
}
# 3. Search registry for install location by display name → find .exe
if (-not $proc) {
$regPaths = @('HKLM:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*','HKLM:\\SOFTWARE\\WOW6432Node\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*','HKCU:\\SOFTWARE\\Microsoft\\Windows\\CurrentVersion\\Uninstall\\*')
foreach ($p in $regPaths) {
$app = Get-ItemProperty $p -ErrorAction SilentlyContinue | Where-Object {
$_.DisplayName -and $_.DisplayName -match [regex]::Escape($target)
} | Select-Object -First 1
if ($app) {
# Try DisplayIcon (often the exe path), then InstallLocation
$exePath = $null
if ($app.DisplayIcon -and $app.DisplayIcon -match '[.]exe') {
$exePath = ($app.DisplayIcon -split ',')[0].Trim('"')
}
if (-not $exePath -and $app.InstallLocation) {
$exeFile = Get-ChildItem $app.InstallLocation -Filter '*.exe' -ErrorAction SilentlyContinue | Select-Object -First 1
if ($exeFile) { $exePath = $exeFile.FullName }
}
if ($exePath -and (Test-Path $exePath)) {
$proc = Start-Process $exePath -PassThru -ErrorAction SilentlyContinue
break
}
}
}
}
# 4. Last resort: direct Start-Process (Windows may resolve it)
if (-not $proc) { $proc = Start-Process -FilePath $target -PassThru -ErrorAction SilentlyContinue }
if (-not $proc) { Write-Host "LAUNCH_FAILED"; exit }
# Wait for a NEW window from our process PID
$hwnd = 0
for ($i = 0; $i -lt 50; $i++) {
Start-Sleep -Milliseconds 200
$afterHwnds = [CuLaunch]::GetAllVisibleHwnds()
# Find new windows (in after but not in before)
foreach ($h in $afterHwnds) {
if ($beforeHwnds -contains $h) { continue }
# New window found — check PID
$wPid = [CuLaunch]::GetPidForHwnd($h)
if ($wPid -eq [uint32]$proc.Id) {
$hwnd = $h; break # exact PID match
}
}
if ($hwnd -ne 0) { break }
# PID didn't match (process redirect) — accept new window matching title hint
if ($i -gt 10) {
$hint = '${escaped}'.Split('\\')[-1].Replace('.exe','')
foreach ($h in $afterHwnds) {
if ($beforeHwnds -contains $h) { continue }
$title = [CuLaunch]::GetTitle($h)
if ($title -and $title.IndexOf($hint, [StringComparison]::OrdinalIgnoreCase) -ge 0) {
$hwnd = $h; break
}
}
if ($hwnd -ne 0) { break }
}
}
if ($hwnd -eq 0) { Write-Host "HWND_NOT_FOUND|$($proc.Id)"; exit }
# Move offscreen instead of minimizing — keeps window restored so
# PrintWindow and SendMessage work without needing restore/re-minimize.
# User cannot see the window at -32000,-32000.
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuPos {
[DllImport("user32.dll")] public static extern bool SetWindowPos(IntPtr h, IntPtr a, int x, int y, int w, int h2, uint f);
public const uint SWP_NOSIZE = 0x0001;
public const uint SWP_NOZORDER = 0x0004;
public const uint SWP_NOACTIVATE = 0x0010;
}
'@
[CuPos]::SetWindowPos([IntPtr]::new([long]$hwnd), [IntPtr]::Zero, -32000, -32000, 0, 0, [CuPos]::SWP_NOSIZE -bor [CuPos]::SWP_NOZORDER -bor [CuPos]::SWP_NOACTIVATE) | Out-Null
Write-Host "$hwnd|$($proc.Id)"
`)

    // Trim once so the status markers are recognised even when the launcher
    // output carries surrounding whitespace or newlines.
    const output = (result ?? '').trim()
    if (!output) {
      throw new Error(
        `open(): failed to launch '${name}' — no output from launcher script`,
      )
    }
    if (output.startsWith('LAUNCH_FAILED')) {
      throw new Error(
        `open(): failed to launch '${name}' — process did not start (${output})`,
      )
    }
    if (output.startsWith('HWND_NOT_FOUND')) {
      throw new Error(
        `open(): launched '${name}' but could not find its window HWND (${output})`,
      )
    }

    // Success output has the form "<hwnd>|<pid>".
    const fields = output.split('|')
    const hwnd = fields[0]!.trim()
    const pid = Number(fields[1])
    if (hwnd && hwnd !== '0') {
      // Bind to the launched window — all subsequent operations target this HWND
      bindWindow(hwnd, pid)
    }
  },

  /**
   * Describe the process that owns the foreground window.
   * @returns executable path as `id` and process name as `appName`, or null
   *          when the lookup fails
   */
  getFrontmostApp(): FrontmostAppInfo | null {
    try {
      const out = ps(`${WIN32_TYPES}
$hwnd = [CuWin32]::GetForegroundWindow()
$procId = [uint32]0
[CuWin32]::GetWindowThreadProcessId($hwnd, [ref]$procId) | Out-Null
$proc = Get-Process -Id $procId -ErrorAction SilentlyContinue
"$($proc.MainModule.FileName)|$($proc.ProcessName)"`)
      if (!out || !out.includes('|')) return null
      const [exePath, appName] = out.split('|', 2)
      return { id: exePath!, appName: appName! }
    } catch {
      return null
    }
  },

  /**
   * Find the first window whose title contains `title` (case-sensitive).
   * Windows without a title are treated as having an empty title.
   */
  findWindowByTitle(title): WindowHandle | null {
    const windows = listWindows()
    // findExistingWindow() guards against a missing title; do the same here
    // instead of calling .includes on undefined.
    const found = windows.find(w => (w.title ?? '').includes(title))
    if (!found) return null
    return { id: String(found.hwnd), pid: found.pid, title: found.title }
  },
}
// ---------------------------------------------------------------------------
// Window Management — Win32 API calls targeted at bound HWND.
// NO global shortcuts (Win+Down, Alt+F4, etc.)
// Uses ShowWindow, SetWindowPos, SendMessage(WM_CLOSE) directly.
// ---------------------------------------------------------------------------
/**
 * PowerShell prelude that compiles the `CuWinMgmt` C# helper class via
 * Add-Type. It declares the user32 P/Invoke signatures used for window
 * management (ShowWindow, SetWindowPos, GetWindowRect, SetForegroundWindow,
 * BringWindowToTop, SendMessageW, IsIconic, IsZoomed), the RECT struct, and
 * the SW_* / SWP_* / WM_* / SC_* constants. Interpolate it at the start of a
 * script before referencing `[CuWinMgmt]`.
 */
const WINDOW_MGMT_TYPES = `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuWinMgmt {
[DllImport("user32.dll")]
public static extern bool ShowWindow(IntPtr hWnd, int nCmdShow);
[DllImport("user32.dll")]
public static extern bool SetWindowPos(IntPtr hWnd, IntPtr hAfter, int X, int Y, int cx, int cy, uint uFlags);
[DllImport("user32.dll")]
public static extern bool GetWindowRect(IntPtr hWnd, out RECT lpRect);
[DllImport("user32.dll")]
public static extern bool SetForegroundWindow(IntPtr hWnd);
[DllImport("user32.dll")]
public static extern bool BringWindowToTop(IntPtr hWnd);
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMessage(IntPtr hWnd, uint Msg, IntPtr wParam, IntPtr lParam);
[DllImport("user32.dll")]
public static extern bool IsIconic(IntPtr hWnd);
[DllImport("user32.dll")]
public static extern bool IsZoomed(IntPtr hWnd);
[StructLayout(LayoutKind.Sequential)]
public struct RECT {
public int Left; public int Top; public int Right; public int Bottom;
}
// ShowWindow constants
public const int SW_MINIMIZE = 6;
public const int SW_MAXIMIZE = 3;
public const int SW_RESTORE = 9;
public const int SW_SHOWNOACTIVATE = 4;
public const int SW_SHOWMINNOACTIVE = 7;
// SetWindowPos flags
public const uint SWP_NOSIZE = 0x0001;
public const uint SWP_NOMOVE = 0x0002;
public const uint SWP_NOZORDER = 0x0004;
public const uint SWP_NOACTIVATE = 0x0010;
public const uint SWP_SHOWWINDOW = 0x0040;
// WM_CLOSE
public const uint WM_CLOSE = 0x0010;
// WM_SYSCOMMAND
public const uint WM_SYSCOMMAND = 0x0112;
public const int SC_MINIMIZE = 0xF020;
public const int SC_MAXIMIZE = 0xF030;
public const int SC_RESTORE = 0xF120;
public const int SC_CLOSE = 0xF060;
}
'@
`
import type { WindowManagementPlatform, WindowAction } from './types.js'
/**
 * Windows implementation of WindowManagementPlatform. Every operation is
 * addressed to the bound HWND through Win32 calls (ShowWindow, SetWindowPos,
 * SendMessage); no global keyboard shortcuts are used. All methods report
 * failure (false / null) when no window is bound.
 */
const windowManagement: WindowManagementPlatform = {
  /**
   * Perform a window action on the bound window.
   *
   * @param action - which operation to perform
   * @param opts   - position / size for 'move_resize'
   * @returns false when no window is bound, the action is unknown, or a
   *          'move_resize' could not be carried out; true otherwise
   */
  manageWindow(action: WindowAction, opts?): boolean {
    if (!boundHwnd) return false
    const hwnd = boundHwnd
    switch (action) {
      case 'minimize': {
        // ShowWindow(SW_SHOWMINNOACTIVE) — minimizes without activating;
        // targeted at the HWND, not global.
        const r = ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::SW_SHOWMINNOACTIVE)`,
        )
        return r !== ''
      }
      case 'maximize': {
        const r = ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::SW_MAXIMIZE)`,
        )
        return r !== ''
      }
      case 'restore': {
        const r = ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::ShowWindow([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::SW_RESTORE)`,
        )
        return r !== ''
      }
      case 'close': {
        // SendMessage(WM_CLOSE) — graceful close targeted at HWND.
        // The border overlay is removed first, and the binding is dropped
        // afterwards.
        // NOTE(review): SendMessage is synchronous; presumably this waits if
        // the target shows a modal prompt while closing — confirm whether a
        // posted message would be preferable.
        unmarkBound(hwnd)
        ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SendMessage([IntPtr]::new([long]${hwnd}), [CuWinMgmt]::WM_CLOSE, [IntPtr]::Zero, [IntPtr]::Zero)`,
        )
        unbindWindow()
        return true
      }
      case 'focus': {
        // Restore if minimized, then bring to front
        ps(`${WINDOW_MGMT_TYPES}
$h = [IntPtr]::new([long]${hwnd})
if ([CuWinMgmt]::IsIconic($h)) {
[CuWinMgmt]::ShowWindow($h, [CuWinMgmt]::SW_RESTORE) | Out-Null
}
[CuWinMgmt]::SetForegroundWindow($h) | Out-Null
[CuWinMgmt]::BringWindowToTop($h) | Out-Null
`)
        return true
      }
      case 'move_offscreen': {
        // Move to -32000,-32000 — keeps window in restored state for SendMessage/PrintWindow
        ps(
          `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, -32000, -32000, 0, 0, [CuWinMgmt]::SWP_NOSIZE -bor [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
        )
        return true
      }
      case 'move_resize': {
        // Without a target position nothing can be moved: report that
        // instead of claiming success.
        if (opts?.x === undefined || opts?.y === undefined) return false
        return this.moveResize(opts.x, opts.y, opts.width, opts.height)
      }
      case 'get_rect': {
        // get_rect is handled separately by getWindowRect(), not through manageWindow
        // Return true to indicate the action is recognized
        return true
      }
      default:
        return false
    }
  },

  /**
   * Move the bound window to (x, y); also resize it when BOTH width and
   * height are given. Coordinates are rounded to integers before being
   * embedded in the script.
   *
   * @returns false when no window is bound or an argument is not a finite
   *          number; true once the SetWindowPos script has been issued
   */
  moveResize(x: number, y: number, width?: number, height?: number): boolean {
    if (!boundHwnd) return false
    const hwnd = boundHwnd
    // NaN / Infinity would be spliced into the PowerShell source verbatim.
    if (!Number.isFinite(x) || !Number.isFinite(y)) return false
    const px = Math.round(x)
    const py = Math.round(y)
    if (width !== undefined && height !== undefined) {
      if (!Number.isFinite(width) || !Number.isFinite(height)) return false
      const pw = Math.round(width)
      const ph = Math.round(height)
      ps(
        `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, ${px}, ${py}, ${pw}, ${ph}, [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
      )
    } else {
      // Position only: SWP_NOSIZE keeps the current size.
      ps(
        `${WINDOW_MGMT_TYPES}; [CuWinMgmt]::SetWindowPos([IntPtr]::new([long]${hwnd}), [IntPtr]::Zero, ${px}, ${py}, 0, 0, [CuWinMgmt]::SWP_NOSIZE -bor [CuWinMgmt]::SWP_NOZORDER -bor [CuWinMgmt]::SWP_NOACTIVATE)`,
      )
    }
    return true
  },

  /**
   * Read the bound window's rectangle via GetWindowRect.
   *
   * @returns x / y of the top-left corner plus width / height, or null when
   *          no window is bound, the call fails, or the output is unparsable
   */
  getWindowRect(): {
    x: number
    y: number
    width: number
    height: number
  } | null {
    if (!boundHwnd) return null
    const out = ps(`${WINDOW_MGMT_TYPES}
$rect = New-Object CuWinMgmt+RECT
if ([CuWinMgmt]::GetWindowRect([IntPtr]::new([long]${boundHwnd}), [ref]$rect)) {
"$($rect.Left),$($rect.Top),$($rect.Right),$($rect.Bottom)"
} else { "FAIL" }
`)
    if (!out || out === 'FAIL') return null
    const edges = out.split(',').map(Number)
    // Anything other than four real numbers would produce a NaN rectangle.
    if (edges.length < 4 || !edges.slice(0, 4).every(Number.isFinite)) {
      return null
    }
    const [l, t, r, b] = edges as [number, number, number, number]
    return { x: l, y: t, width: r - l, height: b - t }
  },
}
// ---------------------------------------------------------------------------
// Export
// ---------------------------------------------------------------------------
/**
 * Remove every overlay (borders, virtual cursor, indicator) and stop the
 * Python bridge subprocess if it was started.
 *
 * Each step is best-effort and isolated from the others: this runs from
 * exit / signal handlers, where a throw in one step must not prevent the
 * remaining steps (in particular the bridge shutdown) from running.
 */
function cleanupAll() {
  const steps: Array<() => void> = [
    () => cleanupAllBorders(),
    () => hideVirtualCursor(),
    () => hideIndicator(),
    // Stop the Python bridge subprocess if it was started
    () => {
      getBridge()?.stopBridge()
    },
  ]
  for (const step of steps) {
    try {
      step()
    } catch {
      // Nothing useful can be done with a cleanup failure during shutdown.
    }
  }
}
// Run the cleanup on normal exit, and on SIGINT / SIGTERM clean up and then
// terminate the process explicitly.
process.on('exit', cleanupAll)
for (const signal of ['SIGINT', 'SIGTERM'] as const) {
  process.on(signal, () => {
    cleanupAll()
    process.exit()
  })
}
/**
 * Windows platform bundle: the input, screenshot, display, apps and
 * window-management implementations defined in this module.
 */
export const platform: Platform = {
  input: input,
  screenshot: screenshot,
  display: display,
  apps: apps,
  windowManagement: windowManagement,
}

View File

@@ -3,21 +3,16 @@ import type { ComputerUseAPI } from '@ant/computer-use-swift'
let cached: ComputerUseAPI | undefined
/**
* Package's js/index.js reads COMPUTER_USE_SWIFT_NODE_PATH (baked by
* build-with-plugins.ts on darwin targets, unset otherwise — falls through to
* the node_modules prebuilds/ path). We cache the loaded native module.
*
* The four @MainActor methods (captureExcluding, captureRegion,
* apps.listInstalled, resolvePrepareCapture) dispatch to DispatchQueue.main
* and will hang under libuv unless CFRunLoop is pumped — call sites wrap
* these in drainRunLoop().
* macOS-only loader for @ant/computer-use-swift.
* Non-darwin platforms should use src/utils/computerUse/platforms/ instead.
*/
export function requireComputerUseSwift(): ComputerUseAPI {
if (process.platform !== 'darwin') {
throw new Error('@ant/computer-use-swift is macOS-only. Use platforms/ for cross-platform.')
}
if (cached) return cached
// eslint-disable-next-line @typescript-eslint/no-require-imports
const mod = require('@ant/computer-use-swift')
// macOS native .node exports a plain object with apps/display/screenshot directly.
// Our cross-platform package exports { ComputerUseAPI } class — needs instantiation.
if (mod.ComputerUseAPI && typeof mod.ComputerUseAPI === 'function') {
cached = new mod.ComputerUseAPI() as ComputerUseAPI
} else {

View File

@@ -0,0 +1,225 @@
/**
* Accessibility Snapshot — captures the UI Automation tree of a window
* and formats it as compact, model-friendly text.
*
* Sent alongside screenshots so the model has BOTH visual + structural
* understanding of the GUI. This enables:
* - Knowing exact element names, types, and positions
* - Using click_element/type_into_element by name instead of pixel coords
* - Understanding disabled/enabled state, current values
*
* Only includes interactive elements (buttons, edits, menus, links, etc.)
* to keep token count low (~200-500 tokens for typical windows).
*/
import { validateHwnd, ps } from './shared.js'
/**
 * One element of a captured UI Automation tree, as produced by parseNode()
 * from the JSON emitted by the capture script.
 */
export interface AccessibilityNode {
role: string // Control type name with the "ControlType." prefix removed (Button, Edit, MenuItem, Text, CheckBox, ...)
name: string // Visible text / accessible name; empty string when the element has none
automationId: string // UI Automation AutomationId; empty string when the element has none
bounds: { x: number; y: number; w: number; h: number } // BoundingRectangle reported by UI Automation, converted to integers
enabled: boolean // false only when the element explicitly reported IsEnabled = false
value?: string // Current text value read through ValuePattern, when available and non-empty
children?: AccessibilityNode[] // Retained descendants; absent when the element has none
}
/**
 * Result of captureAccessibilitySnapshot(): the same tree in two forms plus
 * the time it was taken.
 */
export interface AccessibilitySnapshot {
/** Compact text representation for the model (output of formatForModel) */
text: string
/** Structured tree (for element-targeted actions) */
nodes: AccessibilityNode[]
/** Capture timestamp, from Date.now() (milliseconds since the Unix epoch) */
timestamp: number
}
/**
 * Snapshot the UI Automation tree below a window and return it both as a
 * structured node list and as compact text for the model.
 *
 * The tree is walked by a PowerShell script. Elements with an empty bounding
 * rectangle or an X coordinate below -10000 are skipped; an element is kept
 * when it passes the interactivity filter or has kept descendants.
 *
 * @param hwnd - Window handle as string (validated with validateHwnd)
 * @param maxDepth - Maximum tree depth (default 4)
 * @param interactiveOnly - Only include interactive elements (default true)
 * @returns the snapshot, or null when the script yields nothing, an empty
 *          tree, or output that cannot be parsed
 */
export function captureAccessibilitySnapshot(
  hwnd: string,
  maxDepth: number = 4,
  interactiveOnly: boolean = true,
): AccessibilitySnapshot | null {
  const handle = validateHwnd(hwnd)

  // PowerShell definition of Is-Interactive: either a whitelist of control
  // types or a predicate that accepts everything.
  let interactivityFilter: string
  if (interactiveOnly) {
    interactivityFilter = `
# Interactive control types only
$interactiveTypes = @(
'Button','Edit','ComboBox','CheckBox','RadioButton',
'MenuItem','Menu','MenuBar','Link','Slider','Spinner',
'Tab','TabItem','List','ListItem','Tree','TreeItem',
'DataGrid','DataItem','Document','ScrollBar','ToolBar',
'SplitButton','ToggleButton','Hyperlink'
)
function Is-Interactive($ct) {
$typeName = $ct -replace 'ControlType\\.', ''
return $interactiveTypes -contains $typeName
}`
  } else {
    interactivityFilter = `
function Is-Interactive($ct) { return $true }`
  }

  const script = `
Add-Type -AssemblyName UIAutomationClient
Add-Type -AssemblyName UIAutomationTypes
Add-Type -AssemblyName WindowsBase
${interactivityFilter}
function Get-Tree($el, $depth, $maxDepth) {
if ($depth -ge $maxDepth) { return @() }
$result = @()
$children = $el.FindAll(
[System.Windows.Automation.TreeScope]::Children,
[System.Windows.Automation.Condition]::TrueCondition)
foreach ($child in $children) {
$ct = $child.Current.ControlType.ProgrammaticName
$typeName = $ct -replace 'ControlType\\.', ''
$name = [string]$child.Current.Name
$autoId = [string]$child.Current.AutomationId
$rect = $child.Current.BoundingRectangle
$enabled = $child.Current.IsEnabled
# Skip invisible/offscreen elements
if ($rect.Width -le 0 -or $rect.Height -le 0) { continue }
if ($rect.X -lt -10000) { continue }
$val = $null
try {
$vp = $child.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
if ($vp -ne $null) { $val = $vp.Current.Value }
} catch {}
$isInteractive = Is-Interactive $ct
$sub = Get-Tree $child ($depth + 1) $maxDepth
if ($isInteractive -or $sub.Count -gt 0) {
$node = @{
role = $typeName
name = $name
id = $autoId
x = [int]$rect.X; y = [int]$rect.Y
w = [int]$rect.Width; h = [int]$rect.Height
on = $enabled
}
if ($val -ne $null -and $val -ne '') { $node['v'] = $val }
if ($sub.Count -gt 0) { $node['c'] = $sub }
$result += $node
}
}
return $result
}
try {
$root = [System.Windows.Automation.AutomationElement]::FromHandle([IntPtr]::new([long]${handle}))
if ($root -eq $null) { Write-Output '[]'; exit }
$tree = Get-Tree $root 0 ${maxDepth}
if ($tree -eq $null -or $tree.Count -eq 0) {
Write-Output '[]'
} else {
$tree | ConvertTo-Json -Depth 20 -Compress
}
} catch {
Write-Output '[]'
}
`

  try {
    const output = ps(script)
    if (!output || output === '[]') {
      return null
    }
    // The script may emit a bare object instead of an array when the tree
    // has a single root element, so accept both shapes.
    const decoded = JSON.parse(output)
    let nodes: AccessibilityNode[]
    if (Array.isArray(decoded)) {
      nodes = decoded.map(parseNode)
    } else {
      nodes = [parseNode(decoded)]
    }
    return { text: formatForModel(nodes), nodes, timestamp: Date.now() }
  } catch {
    return null
  }
}
/**
 * Convert one raw node from the snapshot script's JSON into an
 * AccessibilityNode, substituting defaults for missing fields.
 *
 * Short keys map as: id → automationId, x/y/w/h → bounds, on → enabled,
 * v → value, c → children. `c` is accepted either as an array or as a
 * single object; a missing/falsy `c` leaves `children` undefined.
 */
function parseNode(raw: any): AccessibilityNode {
  let children: AccessibilityNode[] | undefined
  if (raw.c) {
    const rawChildren: any[] = Array.isArray(raw.c) ? raw.c : [raw.c]
    children = rawChildren.map(parseNode)
  }

  const bounds = {
    x: raw.x || 0,
    y: raw.y || 0,
    w: raw.w || 0,
    h: raw.h || 0,
  }

  return {
    role: raw.role || '',
    name: raw.name || '',
    automationId: raw.id || '',
    bounds,
    // Only an explicit `false` marks the node as disabled.
    enabled: raw.on !== false,
    value: raw.v,
    children,
  }
}
/**
 * Render the accessibility tree as compact, line-oriented text for the model.
 *
 * One line per node: `[Role] "name" (x,y WxH)`, followed by ` DISABLED` when
 * the node is not enabled, ` value="…"` when it has a value and ` id=…` when
 * it has an automation id. Names are truncated to 40 characters and values
 * to 30. Children are rendered recursively with one more level of padding.
 *
 * Example output:
 * [Button] "Save" (120,50 80x30)
 * [Edit] (200,80 400x25) value="hello world" id=textBox1
 * [MenuItem] "File" (10,0 40x25) DISABLED
 */
function formatForModel(
  nodes: AccessibilityNode[],
  indent: number = 0,
): string {
  const prefix = ' '.repeat(indent)
  const rendered: string[] = []

  nodes.forEach(node => {
    const { x, y, w, h } = node.bounds
    const segments: string[] = [`${prefix}[${node.role}]`]
    if (node.name) segments.push(` "${truncate(node.name, 40)}"`)
    segments.push(` (${x},${y} ${w}x${h})`)
    if (!node.enabled) segments.push(' DISABLED')
    if (node.value) segments.push(` value="${truncate(node.value, 30)}"`)
    if (node.automationId) segments.push(` id=${node.automationId}`)
    rendered.push(segments.join(''))

    if (node.children) {
      rendered.push(formatForModel(node.children, indent + 1))
    }
  })

  return rendered.join('\n')
}
/**
 * Shorten `s` to at most `max` characters. Strings longer than `max` keep
 * their first `max - 1` characters and get a trailing ellipsis character.
 */
function truncate(s: string, max: number): string {
  if (s.length <= max) {
    return s
  }
  const head = s.slice(0, max - 1)
  return head + '…'
}
/**
 * Depth-first search of the accessibility tree for the first node that
 * satisfies every criterion present in `query`.
 *
 * - `name`: case-insensitive substring match
 * - `role`: case-insensitive exact match
 * - `automationId`: exact match
 *
 * A query with no criteria never matches. A node is tested before its
 * children. Returns null when nothing matches.
 */
export function findNodeInSnapshot(
  nodes: AccessibilityNode[],
  query: { name?: string; role?: string; automationId?: string },
): AccessibilityNode | null {
  const hasCriteria = Boolean(query.name || query.role || query.automationId)

  for (const candidate of nodes) {
    const nameOk =
      !query.name ||
      candidate.name.toLowerCase().includes(query.name.toLowerCase())
    const roleOk =
      !query.role || candidate.role.toLowerCase() === query.role.toLowerCase()
    const idOk =
      !query.automationId || candidate.automationId === query.automationId

    if (hasCriteria && nameOk && roleOk && idOk) {
      return candidate
    }

    if (candidate.children) {
      const nested = findNodeInSnapshot(candidate.children, query)
      if (nested) return nested
    }
  }

  return null
}

View File

@@ -0,0 +1,129 @@
/**
* Application type dispatcher for Windows Computer Use.
*
* Routes operations to the appropriate controller based on file type:
* - .xlsx/.xls/.csv → Excel COM (headless, no window)
* - .docx/.doc → Word COM (headless, no window)
* - .txt/.log/.md → notepad + SendMessage + HWND bind (offscreen)
* - Others → generic exe + HWND bind (offscreen)
*/
import { extname } from 'path'
/** Controller category an app name or file path is routed to. */
export type AppType = 'excel' | 'word' | 'text' | 'browser' | 'generic'

// File extensions (lowercase, with leading dot) handled by each controller.
const EXCEL_EXTS = new Set(['.xlsx', '.xls', '.csv', '.xlsm', '.xlsb'])
const WORD_EXTS = new Set(['.docx', '.doc', '.rtf'])
const TEXT_EXTS = new Set([
  '.txt',
  '.log',
  '.md',
  '.json',
  '.xml',
  '.yaml',
  '.yml',
  '.ini',
  '.cfg',
  '.conf',
])
// Executable base names (lowercase, without ".exe") treated as browsers.
const BROWSER_NAMES = new Set(['chrome', 'msedge', 'firefox', 'brave', 'opera'])

/**
 * Detect application type from a file path or an app name.
 *
 * The extension is checked first; if it is not a known document extension
 * the base name (last path segment, trailing ".exe" removed) is inspected.
 * "excel" and "word" must appear as a standalone token (not surrounded by
 * other letters), so "Microsoft Word", "word2016" and "excel" match, while
 * unrelated names that merely contain the letters ("wordpad", "password",
 * "excellent") fall through to the remaining checks.
 *
 * @param nameOrPath App name, executable name, or file path (any case,
 *   `/` or `\` separators).
 * @returns The detected AppType; 'generic' when nothing matches.
 */
export function detectAppType(nameOrPath: string): AppType {
  const lower = nameOrPath.toLowerCase()

  // Check by extension
  const ext = extname(lower)
  if (ext) {
    if (EXCEL_EXTS.has(ext)) return 'excel'
    if (WORD_EXTS.has(ext)) return 'word'
    if (TEXT_EXTS.has(ext)) return 'text'
  }

  // Check by app name
  const baseName =
    lower
      .replace(/\.exe$/, '')
      .split(/[/\\]/)
      .pop() ?? ''

  // Letter-delimited token match: avoids substring false positives.
  const excelToken = /(^|[^a-z])excel([^a-z]|$)/
  const wordToken = /(^|[^a-z])word([^a-z]|$)/

  if (excelToken.test(baseName)) return 'excel'
  if (baseName === 'winword' || wordToken.test(baseName)) return 'word'
  if (baseName === 'notepad' || baseName === 'notepad++' || baseName === 'code')
    return 'text'
  if (BROWSER_NAMES.has(baseName)) return 'browser'
  return 'generic'
}
/**
 * Result of openWithController: the detected app type plus whichever
 * handle (window handle or file path) later operations should target.
 */
export interface OpenResult {
type: AppType
/** HWND for text/browser/generic apps (SendMessage target) */
hwnd?: string
/** File path for COM-controlled apps (Excel/Word) */
filePath?: string
}
/**
 * Open a file or app using the controller that matches its detected type.
 *
 * - excel / word: when `nameOrPath` ends in a known document extension the
 *   file is opened once through the COM helper (failures are ignored) and
 *   its path is returned. Otherwise a new empty document is created under
 *   `%TEMP%` and that path is returned.
 * - text / browser / generic: only the type is returned; launching and HWND
 *   binding are left to the caller.
 */
export async function openWithController(
  nameOrPath: string,
): Promise<OpenResult> {
  const type = detectAppType(nameOrPath)

  if (type === 'excel') {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const excelCom = require('./comExcel.js') as typeof import('./comExcel.js')
    const pointsAtWorkbook = nameOrPath.match(/\.(xlsx|xls|csv|xlsm|xlsb)$/i)
    if (pointsAtWorkbook) {
      // Best-effort readability probe; the path is returned either way.
      try {
        excelCom.openExcel(nameOrPath)
      } catch {
        // ignored on purpose
      }
      return { type: 'excel', filePath: nameOrPath }
    }
    // "excel" or "excel.exe" without a file — create new
    const newWorkbook = `${process.env.TEMP || '/tmp'}\\cu_new_${Date.now()}.xlsx`
    excelCom.createExcel(newWorkbook)
    return { type: 'excel', filePath: newWorkbook }
  }

  if (type === 'word') {
    // eslint-disable-next-line @typescript-eslint/no-require-imports
    const wordCom = require('./comWord.js') as typeof import('./comWord.js')
    const pointsAtDocument = nameOrPath.match(/\.(docx|doc|rtf)$/i)
    if (pointsAtDocument) {
      try {
        wordCom.openWord(nameOrPath)
      } catch {
        // ignored on purpose
      }
      return { type: 'word', filePath: nameOrPath }
    }
    const newDocument = `${process.env.TEMP || '/tmp'}\\cu_new_${Date.now()}.docx`
    wordCom.createWord(newDocument)
    return { type: 'word', filePath: newDocument }
  }

  // text/browser/generic — HWND bind handled by caller (platforms/win32.ts open())
  return { type }
}

View File

@@ -0,0 +1,525 @@
"""
Python Bridge for Windows Computer Use.
Long-lived subprocess communicating via stdin/stdout JSON lines.
Replaces per-call PowerShell spawning with a persistent process.
Capabilities:
- screenshot: full-screen or per-window (mss + PrintWindow)
- input: mouse click/move/drag, keyboard type/key (ctypes user32)
- windows: enumerate, find, get rect, manage (show/min/max/close)
- accessibility: UI Automation tree snapshot (comtypes + UIAutomation)
Protocol: one JSON object per line on stdin → one JSON object per line on stdout.
Request: {"id": 1, "method": "screenshot", "params": {...}}
Response: {"id": 1, "result": {...}} or {"id": 1, "error": "message"}
"""
import sys
import json
import base64
import io
import ctypes
import ctypes.wintypes
import time
import os
# Force UTF-8 output
# Both stdio streams carry the JSON-lines protocol, so they are switched to
# UTF-8 explicitly instead of relying on the console's default encoding.
sys.stdout.reconfigure(encoding='utf-8')
sys.stdin.reconfigure(encoding='utf-8')
# Handles to the system DLLs used throughout this module.
user32 = ctypes.windll.user32
gdi32 = ctypes.windll.gdi32
kernel32 = ctypes.windll.kernel32
# ---------------------------------------------------------------------------
# Win32 constants & types
# ---------------------------------------------------------------------------
# Window message identifiers (keyboard, close, mouse buttons, mouse move).
WM_CHAR = 0x0102
WM_KEYDOWN = 0x0100
WM_KEYUP = 0x0101
WM_CLOSE = 0x0010
WM_LBUTTONDOWN = 0x0201
WM_LBUTTONUP = 0x0202
WM_RBUTTONDOWN = 0x0204
WM_RBUTTONUP = 0x0205
WM_MOUSEMOVE = 0x0200
# ShowWindow commands.
SW_MINIMIZE = 6
SW_MAXIMIZE = 3
SW_RESTORE = 9
SW_SHOWMINNOACTIVE = 7
# SetWindowPos flags.
SWP_NOSIZE = 0x0001
SWP_NOMOVE = 0x0002
SWP_NOZORDER = 0x0004
SWP_NOACTIVATE = 0x0010
# Callback prototype passed to EnumWindows: (hwnd, lparam) -> bool.
WNDENUMPROC = ctypes.WINFUNCTYPE(ctypes.c_bool, ctypes.c_void_p, ctypes.c_void_p)
class RECT(ctypes.Structure):
    """Rectangle of four C longs: left, top, right, bottom."""

    _fields_ = [
        ("left", ctypes.c_long),
        ("top", ctypes.c_long),
        ("right", ctypes.c_long),
        ("bottom", ctypes.c_long),
    ]
class POINT(ctypes.Structure):
    """Point made of two C longs: x, y."""

    _fields_ = [
        ("x", ctypes.c_long),
        ("y", ctypes.c_long),
    ]
# SendMessageW
# Explicit prototype: hwnd, wParam, lParam and the result are declared as
# pointer-sized (c_void_p) and the message id as an unsigned int, so ctypes
# does not fall back to its default C int conversion for these arguments.
SendMessageW = user32.SendMessageW
SendMessageW.argtypes = [ctypes.c_void_p, ctypes.c_uint, ctypes.c_void_p, ctypes.c_void_p]
SendMessageW.restype = ctypes.c_void_p
# ---------------------------------------------------------------------------
# Screenshot
# ---------------------------------------------------------------------------
def screenshot_full(display_id=0):
    """Full-screen screenshot via mss, returns JPEG base64.

    Args:
        display_id: Zero-based monitor index. It is mapped to
            ``sct.monitors[display_id + 1]``. Values that are not integers,
            are negative, or are past the last monitor fall back to
            ``sct.monitors[1]``.

    Returns:
        dict with ``base64`` (JPEG data, quality 75), ``width`` and ``height``.
    """
    import mss
    from PIL import Image

    try:
        requested = int(display_id)
    except (TypeError, ValueError):
        requested = 0

    with mss.mss() as sct:
        monitors = sct.monitors
        # Guard the lower bound too: a negative id would otherwise index the
        # list from the end (or select monitors[0]) instead of falling back.
        in_range = 0 <= requested < len(monitors) - 1
        monitor = monitors[requested + 1] if in_range else monitors[1]
        shot = sct.grab(monitor)
        img = Image.frombytes('RGB', shot.size, shot.bgra, 'raw', 'BGRX')

    buf = io.BytesIO()
    img.save(buf, format='JPEG', quality=75)
    return {
        'base64': base64.b64encode(buf.getvalue()).decode(),
        'width': shot.width,
        'height': shot.height,
    }
def screenshot_window(hwnd_str):
    """Window screenshot via PrintWindow, returns JPEG base64.

    A minimized window is temporarily shown without activation for the
    capture and minimized again afterwards. If PrintWindow fails the capture
    falls back to BitBlt from the window DC.

    Args:
        hwnd_str: Window handle as a decimal string (or int).

    Returns:
        dict with ``base64`` (JPEG data, quality 75), ``width`` and
        ``height``; None when the handle is not a window, the window has no
        area, or a GDI object / the pixel data could not be obtained.
    """
    from PIL import Image

    class BITMAPINFOHEADER(ctypes.Structure):
        _fields_ = [
            ('biSize', ctypes.c_uint32), ('biWidth', ctypes.c_int32),
            ('biHeight', ctypes.c_int32), ('biPlanes', ctypes.c_uint16),
            ('biBitCount', ctypes.c_uint16), ('biCompression', ctypes.c_uint32),
            ('biSizeImage', ctypes.c_uint32), ('biXPelsPerMeter', ctypes.c_int32),
            ('biYPelsPerMeter', ctypes.c_int32), ('biClrUsed', ctypes.c_uint32),
            ('biClrImportant', ctypes.c_uint32),
        ]

    hwnd = int(hwnd_str)
    if not user32.IsWindow(hwnd):
        return None

    # Get window rect
    rect = RECT()
    user32.GetWindowRect(hwnd, ctypes.byref(rect))
    w = rect.right - rect.left
    h = rect.bottom - rect.top
    if w <= 0 or h <= 0:
        return None

    was_minimized = user32.IsIconic(hwnd)
    hdc_window = hdc_mem = hbm = old_obj = None
    pixel_buf = None
    try:
        # Handle minimized windows
        if was_minimized:
            user32.ShowWindow(hwnd, 4)  # SW_SHOWNOACTIVATE
            time.sleep(0.1)
            user32.GetWindowRect(hwnd, ctypes.byref(rect))
            w = rect.right - rect.left
            h = rect.bottom - rect.top
            if w <= 0 or h <= 0:
                return None

        # Create DC and bitmap
        hdc_window = user32.GetDC(hwnd)
        if not hdc_window:
            return None
        hdc_mem = gdi32.CreateCompatibleDC(hdc_window)
        hbm = gdi32.CreateCompatibleBitmap(hdc_window, w, h)
        if not hdc_mem or not hbm:
            return None
        old_obj = gdi32.SelectObject(hdc_mem, hbm)

        # PrintWindow with PW_RENDERFULLCONTENT
        if not user32.PrintWindow(hwnd, hdc_mem, 2):
            # Fallback to BitBlt
            gdi32.BitBlt(hdc_mem, 0, 0, w, h, hdc_window, 0, 0, 0x00CC0020)  # SRCCOPY

        # Deselect the bitmap before reading it and before deleting it:
        # GetDIBits requires the bitmap not to be selected into a DC, and a
        # GDI object must not be deleted while it is still selected.
        gdi32.SelectObject(hdc_mem, old_obj)
        old_obj = None

        # Extract bitmap bits
        bmi = BITMAPINFOHEADER()
        bmi.biSize = ctypes.sizeof(BITMAPINFOHEADER)
        bmi.biWidth = w
        bmi.biHeight = -h  # top-down
        bmi.biPlanes = 1
        bmi.biBitCount = 32
        bmi.biCompression = 0  # BI_RGB
        raw = ctypes.create_string_buffer(w * h * 4)
        if gdi32.GetDIBits(hdc_mem, hbm, 0, h, raw, ctypes.byref(bmi), 0):
            pixel_buf = raw
    finally:
        # Cleanup GDI — runs on every exit path so a long-lived bridge
        # process does not accumulate leaked handles.
        if old_obj:
            gdi32.SelectObject(hdc_mem, old_obj)
        if hbm:
            gdi32.DeleteObject(hbm)
        if hdc_mem:
            gdi32.DeleteDC(hdc_mem)
        if hdc_window:
            user32.ReleaseDC(hwnd, hdc_window)
        if was_minimized:
            user32.ShowWindow(hwnd, SW_SHOWMINNOACTIVE)

    if pixel_buf is None:
        return None

    # Convert to JPEG
    img = Image.frombuffer('RGBA', (w, h), pixel_buf, 'raw', 'BGRA', 0, 1)
    img = img.convert('RGB')
    out = io.BytesIO()
    img.save(out, format='JPEG', quality=75)
    return {
        'base64': base64.b64encode(out.getvalue()).decode(),
        'width': w,
        'height': h,
    }
# ---------------------------------------------------------------------------
# Window management
# ---------------------------------------------------------------------------
def list_windows():
    """Return every visible top-level window that has a non-empty title.

    Each entry is a dict with ``hwnd`` (handle as a string), ``pid`` (owning
    process id) and ``title``.
    """
    found = []

    def on_window(hwnd, _lparam):
        # Always return True so the enumeration continues.
        if not user32.IsWindowVisible(hwnd):
            return True
        title_len = user32.GetWindowTextLengthW(hwnd)
        if title_len > 0:
            title = ctypes.create_unicode_buffer(title_len + 1)
            user32.GetWindowTextW(hwnd, title, title_len + 1)
            owner = ctypes.c_uint32()
            user32.GetWindowThreadProcessId(hwnd, ctypes.byref(owner))
            found.append({'hwnd': str(hwnd), 'pid': owner.value, 'title': title.value})
        return True

    user32.EnumWindows(WNDENUMPROC(on_window), 0)
    return found
def get_window_rect(hwnd_str):
    """Return the window rectangle as ``{x, y, width, height}``.

    Returns None when GetWindowRect reports failure.
    """
    bounds = RECT()
    ok = user32.GetWindowRect(int(hwnd_str), ctypes.byref(bounds))
    if not ok:
        return None
    return {
        'x': bounds.left,
        'y': bounds.top,
        'width': bounds.right - bounds.left,
        'height': bounds.bottom - bounds.top,
    }
def get_client_offset(hwnd_str):
    """Return the offset of the client area's origin from the window's
    top-left corner as ``{dx, dy}`` (title bar height, border width)."""
    hwnd = int(hwnd_str)

    window_rect = RECT()
    user32.GetWindowRect(hwnd, ctypes.byref(window_rect))

    # Client (0, 0) expressed in screen coordinates.
    origin = POINT(0, 0)
    user32.ClientToScreen(hwnd, ctypes.byref(origin))

    dx = origin.x - window_rect.left
    dy = origin.y - window_rect.top
    return {'dx': dx, 'dy': dy}
def manage_window(hwnd_str, action):
    """Apply a window-management action to a window.

    Args:
        hwnd_str: Window handle as a decimal string (or int).
        action: One of ``minimize``, ``maximize``, ``restore``, ``close``,
            ``focus``, ``move_offscreen``.

    Returns:
        True when the action was issued; False when the handle is not a
        window or the action is unknown.
    """
    hwnd = int(hwnd_str)
    if not user32.IsWindow(hwnd):
        return False

    show_commands = {
        'minimize': SW_SHOWMINNOACTIVE,
        'maximize': SW_MAXIMIZE,
        'restore': SW_RESTORE,
    }
    command = show_commands.get(action) if isinstance(action, str) else None
    if command is not None:
        # ShowWindow returns the window's PREVIOUS visibility, not a success
        # flag, so its return value must not be reported as the result.
        user32.ShowWindow(hwnd, command)
        return True

    if action == 'close':
        SendMessageW(hwnd, WM_CLOSE, 0, 0)
        return True

    if action == 'focus':
        if user32.IsIconic(hwnd):
            user32.ShowWindow(hwnd, SW_RESTORE)
        user32.SetForegroundWindow(hwnd)
        return True

    if action == 'move_offscreen':
        user32.SetWindowPos(hwnd, 0, -32000, -32000, 0, 0,
                            SWP_NOSIZE | SWP_NOZORDER | SWP_NOACTIVATE)
        return True

    return False
# ---------------------------------------------------------------------------
# Input — all via SendMessageW (window-targeted, no global)
# ---------------------------------------------------------------------------
def make_lparam(x, y):
    """Pack client coordinates into a mouse-message lParam.

    x goes in the low 16 bits and y in the high 16 bits. Both values are
    coerced to int (so float coordinates are accepted and truncated) and
    masked to 16 bits, so the result always fits in 32 bits.
    """
    return ((int(y) & 0xFFFF) << 16) | (int(x) & 0xFFFF)
def send_click(hwnd_str, x, y, button='left'):
    """Send a button-down / button-up pair to a window at client (x, y).

    Args:
        hwnd_str: Window handle as a decimal string (or int).
        x, y: Client-area coordinates.
        button: ``'left'``, ``'right'`` or ``'middle'``.

    Returns:
        True when the messages were sent; False for an unsupported button
        (nothing is sent in that case).
    """
    hwnd = int(hwnd_str)
    button_messages = {
        'left': (WM_LBUTTONDOWN, WM_LBUTTONUP),
        'right': (WM_RBUTTONDOWN, WM_RBUTTONUP),
        'middle': (0x0207, 0x0208),  # WM_MBUTTONDOWN, WM_MBUTTONUP
    }
    pair = button_messages.get(button) if isinstance(button, str) else None
    if pair is None:
        return False

    lp = make_lparam(x, y)
    down_msg, up_msg = pair
    SendMessageW(hwnd, down_msg, 0, lp)
    SendMessageW(hwnd, up_msg, 0, lp)
    return True
def send_text(hwnd_str, text):
    """Type ``text`` into a window, one WM_CHAR message per UTF-16 code unit.

    Characters above U+FFFF are split into a high and a low surrogate and
    sent as two consecutive messages. Always returns True.
    """
    hwnd = int(hwnd_str)
    for char in text:
        code_point = ord(char)
        if code_point > 0xFFFF:
            # Surrogate pair
            offset = code_point - 0x10000
            units = (0xD800 + (offset >> 10), 0xDC00 + (offset & 0x3FF))
        else:
            units = (code_point,)
        for unit in units:
            SendMessageW(hwnd, WM_CHAR, unit, 0)
    return True
def send_key(hwnd_str, vk, action='down'):
    """Send one WM_KEYDOWN (``action == 'down'``) or WM_KEYUP (any other
    action) for virtual-key code ``vk``. Always returns True."""
    target = int(hwnd_str)
    if action == 'down':
        message = WM_KEYDOWN
    else:
        message = WM_KEYUP
    SendMessageW(target, message, vk, 0)
    return True
def send_keys_combo(hwnd_str, keys):
    """Send a key combination like ['ctrl', 's'].

    Modifier keys (ctrl/control, shift, alt) are pressed in the order given,
    then the main key is pressed and released, then the modifiers are
    released in reverse order. The main key is the last non-modifier entry
    that is either a named key or a single character; other entries are
    ignored. Returns False when no main key was found, True otherwise.
    """
    named_keys = {
        'ctrl': 0x11, 'control': 0x11, 'shift': 0x10, 'alt': 0x12,
        'enter': 0x0D, 'return': 0x0D, 'tab': 0x09, 'escape': 0x1B,
        'backspace': 0x08, 'delete': 0x2E, 'space': 0x20,
        'left': 0x25, 'up': 0x26, 'right': 0x27, 'down': 0x28,
        'home': 0x24, 'end': 0x23, 'pageup': 0x21, 'pagedown': 0x22,
        'f1': 0x70, 'f2': 0x71, 'f3': 0x72, 'f4': 0x73, 'f5': 0x74,
        'f6': 0x75, 'f7': 0x76, 'f8': 0x77, 'f9': 0x78, 'f10': 0x79,
        'f11': 0x7A, 'f12': 0x7B,
    }
    modifier_names = ('ctrl', 'control', 'shift', 'alt')

    hwnd = int(hwnd_str)
    held = []
    target = None
    for raw_key in keys:
        key = raw_key.lower()
        if key in modifier_names:
            held.append(named_keys.get(key, 0))
            continue
        if key in named_keys:
            target = named_keys[key]
        elif len(key) == 1:
            target = ord(key.upper())

    if target is None:
        return False

    sequence = [(WM_KEYDOWN, vk) for vk in held]
    sequence += [(WM_KEYDOWN, target), (WM_KEYUP, target)]
    sequence += [(WM_KEYUP, vk) for vk in reversed(held)]
    for message, vk in sequence:
        SendMessageW(hwnd, message, vk, 0)
    return True
def send_mouse_down(hwnd_str, x, y):
    """Send WM_LBUTTONDOWN at client (x, y). Always returns True."""
    target = int(hwnd_str)
    position = make_lparam(x, y)
    SendMessageW(target, WM_LBUTTONDOWN, 0, position)
    return True
def send_mouse_up(hwnd_str, x, y):
    """Send WM_LBUTTONUP at client (x, y). Always returns True."""
    target = int(hwnd_str)
    position = make_lparam(x, y)
    SendMessageW(target, WM_LBUTTONUP, 0, position)
    return True
def send_mouse_move(hwnd_str, x, y):
    """Send WM_MOUSEMOVE at client (x, y). Always returns True."""
    target = int(hwnd_str)
    position = make_lparam(x, y)
    SendMessageW(target, WM_MOUSEMOVE, 0, position)
    return True
# ---------------------------------------------------------------------------
# Accessibility snapshot (UI Automation via comtypes)
# ---------------------------------------------------------------------------
_uia_client = None
def _get_uia():
global _uia_client
if _uia_client is None:
try:
import comtypes.client
comtypes.client.GetModule('UIAutomationCore.dll')
from comtypes.gen.UIAutomationClient import CUIAutomation
_uia_client = comtypes.client.CreateObject(CUIAutomation)
except Exception:
# Fallback: use pywinauto
pass
return _uia_client
def accessibility_snapshot(hwnd_str, max_depth=4):
    """Build an accessibility tree for a window using pywinauto's UIA backend.

    The window is looked up by handle among the desktop's top-level windows.
    Elements with no area or positioned left of x = -10000 are skipped. A
    node is kept when its control type is in the interactive set or when it
    has kept descendants. Nodes use short keys: role, name, id, x, y, w, h,
    on, plus optional v (value, at most 100 chars) and c (children).

    Returns the list of top-level nodes, or None when the window is not
    found, the tree is empty, or any error occurs.
    """
    try:
        from pywinauto import Desktop
        from pywinauto.controls.uiawrapper import UIAWrapper  # noqa: F401

        hwnd = int(hwnd_str)
        desktop = Desktop(backend='uia')
        # Find window by handle
        target = next((cand for cand in desktop.windows() if cand.handle == hwnd), None)
        if target is None:
            return None

        interactive_types = {
            'Button', 'Edit', 'ComboBox', 'CheckBox', 'RadioButton',
            'MenuItem', 'Menu', 'MenuBar', 'Hyperlink', 'Slider',
            'Tab', 'TabItem', 'List', 'ListItem', 'Document',
            'TreeItem', 'DataItem', 'ToolBar', 'SplitButton',
        }

        def describe(child, depth):
            """Return the node dict for ``child`` or None when it is dropped."""
            info = child.element_info
            control_type = info.control_type or ''
            name = info.name or ''
            auto_id = info.automation_id or ''
            rect = child.rectangle()
            width = rect.right - rect.left
            height = rect.bottom - rect.top
            if width <= 0 or height <= 0 or rect.left < -10000:
                return None
            enabled = child.is_enabled()
            value = None
            try:
                value = child.get_value()
            except Exception:
                pass
            descendants = walk(child, depth + 1)
            if control_type not in interactive_types and not descendants:
                return None
            node = {
                'role': control_type, 'name': name, 'id': auto_id,
                'x': rect.left, 'y': rect.top, 'w': width, 'h': height,
                'on': enabled,
            }
            if value:
                node['v'] = str(value)[:100]
            if descendants:
                node['c'] = descendants
            return node

        def walk(element, depth):
            if depth >= max_depth:
                return []
            try:
                children = element.children()
            except Exception:
                return []
            collected = []
            for child in children:
                try:
                    node = describe(child, depth)
                except Exception:
                    continue
                if node is not None:
                    collected.append(node)
            return collected

        tree = walk(target, 0)
        if not tree:
            return None
        return tree
    except Exception:
        return None
# ---------------------------------------------------------------------------
# Find edit child (for text input targeting)
# ---------------------------------------------------------------------------
def find_edit_child(hwnd_str):
    """Find the first Edit or Document descendant of a window via UI Automation.

    Returns that control's handle as a string, or None when the window is
    not found, no such control exists, the first matching control has no
    handle, or any error occurs.
    """
    try:
        from pywinauto import Desktop

        hwnd = int(hwnd_str)
        desktop = Desktop(backend='uia')
        window = next((cand for cand in desktop.windows() if cand.handle == hwnd), None)
        if window is None:
            return None

        # Find first Edit or Document control
        for control in window.descendants():
            try:
                if control.element_info.control_type in ('Edit', 'Document'):
                    handle = control.handle
                    return str(handle) if handle else None
            except Exception:
                continue
    except Exception:
        pass
    return None
# ---------------------------------------------------------------------------
# Clipboard paste (for large text)
# ---------------------------------------------------------------------------
def paste_text(hwnd_str, text):
    """Set clipboard + send Ctrl+V via SendMessage.

    The text is placed on the clipboard as CF_UNICODETEXT (UTF-16-LE with a
    terminating NUL), then a Ctrl+V key combination is sent to the window.

    Args:
        hwnd_str: Window handle as a decimal string (or int).
        text: Text to paste.

    Returns:
        True when the clipboard was set and the key combination was sent;
        False when the clipboard could not be opened or the data could not
        be allocated or stored.
    """
    CF_UNICODETEXT = 13
    GMEM_MOVEABLE = 0x0002

    # Private DLL handles: the prototypes set below stay local to this
    # function instead of being attached to the shared ctypes.windll objects.
    k32 = ctypes.WinDLL('kernel32', use_last_error=True)
    u32 = ctypes.WinDLL('user32', use_last_error=True)

    # Handles and pointers must be declared pointer-sized; ctypes' default
    # C int return type would truncate them on 64-bit Windows.
    k32.GlobalAlloc.argtypes = [ctypes.c_uint, ctypes.c_size_t]
    k32.GlobalAlloc.restype = ctypes.c_void_p
    k32.GlobalLock.argtypes = [ctypes.c_void_p]
    k32.GlobalLock.restype = ctypes.c_void_p
    k32.GlobalUnlock.argtypes = [ctypes.c_void_p]
    k32.GlobalFree.argtypes = [ctypes.c_void_p]
    k32.GlobalFree.restype = ctypes.c_void_p
    u32.OpenClipboard.argtypes = [ctypes.c_void_p]
    u32.SetClipboardData.argtypes = [ctypes.c_uint, ctypes.c_void_p]
    u32.SetClipboardData.restype = ctypes.c_void_p

    data = text.encode('utf-16-le') + b'\x00\x00'

    # Another process may hold the clipboard briefly; retry a few times.
    opened = False
    for _ in range(5):
        if u32.OpenClipboard(None):
            opened = True
            break
        time.sleep(0.02)
    if not opened:
        return False

    try:
        u32.EmptyClipboard()
        handle = k32.GlobalAlloc(GMEM_MOVEABLE, len(data))
        if not handle:
            return False
        ptr = k32.GlobalLock(handle)
        if not ptr:
            k32.GlobalFree(handle)
            return False
        ctypes.memmove(ptr, data, len(data))
        k32.GlobalUnlock(handle)
        # On success the system owns the memory; free it only on failure.
        if not u32.SetClipboardData(CF_UNICODETEXT, handle):
            k32.GlobalFree(handle)
            return False
    finally:
        u32.CloseClipboard()

    # Send Ctrl+V
    send_keys_combo(hwnd_str, ['ctrl', 'v'])
    return True
# ---------------------------------------------------------------------------
# Mouse wheel scroll (WM_MOUSEWHEEL / WM_MOUSEHWHEEL)
# ---------------------------------------------------------------------------
# Window message identifiers for vertical and horizontal wheel scrolling.
WM_MOUSEWHEEL = 0x020A
WM_MOUSEHWHEEL = 0x020E
# ClientToScreen for screen coords in lParam
# Explicit prototype: (hwnd as pointer-sized value, POINT*) -> bool.
user32.ClientToScreen.argtypes = [ctypes.c_void_p, ctypes.POINTER(POINT)]
user32.ClientToScreen.restype = ctypes.c_bool
def send_mouse_wheel(hwnd_str, x, y, delta, horizontal=False):
    """Send a wheel-scroll message for client coordinates (x, y).

    ``delta`` is a number of wheel clicks (truncated to int; one click is
    120 units): positive = up/right, negative = down/left. ``horizontal``
    selects WM_MOUSEHWHEEL instead of WM_MOUSEWHEEL. The client point is
    converted to screen coordinates before being packed into lParam.
    Always returns True.
    """
    hwnd = int(hwnd_str)
    message = WM_MOUSEHWHEEL if horizontal else WM_MOUSEWHEEL
    units = 120 * int(delta)

    # Convert client coords to screen coords for lParam
    screen_pt = POINT(int(x), int(y))
    user32.ClientToScreen(hwnd, ctypes.byref(screen_pt))

    # wParam: high word = delta (signed short), low word = modifier keys (0)
    packed_delta = ctypes.c_void_p(units << 16)
    # lParam: screen coords
    packed_position = ctypes.c_void_p((screen_pt.y << 16) | (screen_pt.x & 0xFFFF))

    SendMessageW(hwnd, message, packed_delta, packed_position)
    return True
# ---------------------------------------------------------------------------
# Dispatch
# ---------------------------------------------------------------------------
# Dispatch table: protocol method name -> callable taking the request's
# ``params`` dict. Required parameters are read with ``p[...]`` (a missing
# key raises KeyError); optional ones use ``p.get(...)`` with a default.
METHODS = {
'screenshot': lambda p: screenshot_full(p.get('display_id', 0)),
'screenshot_window': lambda p: screenshot_window(p['hwnd']),
'list_windows': lambda p: list_windows(),
'get_window_rect': lambda p: get_window_rect(p['hwnd']),
'get_client_offset': lambda p: get_client_offset(p['hwnd']),
'manage_window': lambda p: manage_window(p['hwnd'], p['action']),
'send_click': lambda p: send_click(p['hwnd'], p['x'], p['y'], p.get('button', 'left')),
'send_text': lambda p: send_text(p['hwnd'], p['text']),
'send_key': lambda p: send_key(p['hwnd'], p['vk'], p.get('action', 'down')),
'send_keys': lambda p: send_keys_combo(p['hwnd'], p['keys']),
'send_mouse_down': lambda p: send_mouse_down(p['hwnd'], p['x'], p['y']),
'send_mouse_up': lambda p: send_mouse_up(p['hwnd'], p['x'], p['y']),
'send_mouse_move': lambda p: send_mouse_move(p['hwnd'], p['x'], p['y']),
'paste_text': lambda p: paste_text(p['hwnd'], p['text']),
'send_mouse_wheel': lambda p: send_mouse_wheel(p['hwnd'], p['x'], p['y'], p['delta'], p.get('horizontal', False)),
'find_edit_child': lambda p: find_edit_child(p['hwnd']),
'accessibility_snapshot': lambda p: accessibility_snapshot(p['hwnd'], p.get('max_depth', 4)),
# Liveness probe: reports this process's pid.
'ping': lambda p: {'ok': True, 'pid': os.getpid()},
}
def main():
    """Main loop: read JSON lines from stdin, dispatch, write JSON lines to stdout.

    Every non-blank input line produces exactly one output line:
    ``{"id": ..., "result": ...}`` on success or ``{"id": ..., "error": "..."}``
    on failure. Malformed JSON and requests that are not JSON objects are
    answered with id 0. A handler exception or a result that cannot be
    serialized is reported as an error for that request; neither terminates
    the loop.
    """

    def emit(resp):
        try:
            payload = json.dumps(resp, ensure_ascii=False)
        except (TypeError, ValueError) as e:
            # The handler returned something JSON cannot represent.
            payload = json.dumps(
                {'id': resp.get('id', 0), 'error': f'unserializable result: {e}'},
                ensure_ascii=False,
            )
        sys.stdout.write(payload + '\n')
        sys.stdout.flush()

    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue

        try:
            req = json.loads(line)
        except json.JSONDecodeError as e:
            emit({'id': 0, 'error': f'invalid JSON: {e}'})
            continue

        # Valid JSON that is not an object (list, number, ...) has no
        # id/method to read; answer instead of crashing on ``.get``.
        if not isinstance(req, dict):
            emit({'id': 0, 'error': 'invalid request: expected a JSON object'})
            continue

        req_id = req.get('id', 0)
        method = req.get('method', '')
        params = req.get('params') or {}

        handler = METHODS.get(method) if isinstance(method, str) else None
        if handler is None:
            emit({'id': req_id, 'error': f'unknown method: {method}'})
            continue

        try:
            resp = {'id': req_id, 'result': handler(params)}
        except Exception as e:
            # Never send an empty error string: the reader would not be able
            # to tell it apart from a missing error.
            resp = {'id': req_id, 'error': str(e) or type(e).__name__}
        emit(resp)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,191 @@
/**
* Python Bridge Client — manages a long-lived Python subprocess for Windows
* Computer Use operations.
*
* Replaces per-call PowerShell spawning with a persistent Python process
* that communicates via JSON lines over stdin/stdout.
*
* Performance: ~1-5ms per call vs ~200-500ms per PowerShell spawn.
*/
import * as path from 'path'
/** One request line written to the bridge's stdin. */
interface BridgeRequest {
id: number
method: string
params: Record<string, unknown>
}
/** One response line read from the bridge's stdout; `id` echoes the request. */
interface BridgeResponse {
id: number
result?: unknown
error?: string
}
// The running bridge subprocess, or null when none has been started.
let bridgeProc: ReturnType<typeof Bun.spawn> | null = null
// Monotonic counter used to generate request ids.
let requestId = 0
// Settle callbacks of in-flight requests, keyed by request id.
const pendingRequests = new Map<
number,
{
resolve: (value: unknown) => void
reject: (error: Error) => void
}
>()
// Stdout text received so far that does not yet end in a newline.
let outputBuffer = ''
/**
 * Start the Python bridge process if not already running.
 *
 * Spawns `python -u bridge.py` with piped stdin/stdout and starts a
 * background loop that splits stdout into lines, parses each line as a
 * BridgeResponse and settles the matching pending request. Lines that are
 * not valid JSON, and responses with an unknown id, are ignored.
 *
 * When the process's stdout ends, the bridge is marked as not running and
 * every still-pending request is rejected, so the next call can start a
 * fresh process instead of waiting on a dead one.
 *
 * @returns true when a bridge process is available, false if spawning failed.
 */
export function ensureBridge(): boolean {
  if (bridgeProc) return true
  try {
    const scriptPath = path.join(__dirname, 'bridge.py')
    const proc = Bun.spawn(['python', '-u', scriptPath], {
      stdin: 'pipe',
      stdout: 'pipe',
      stderr: 'ignore',
      env: { ...process.env, PYTHONIOENCODING: 'utf-8', PYTHONUNBUFFERED: '1' },
    })
    bridgeProc = proc
    outputBuffer = ''

    // Read stdout lines asynchronously
    const reader = proc.stdout.getReader()
    // One decoder in streaming mode for the whole stream: a multi-byte
    // UTF-8 character split across two chunks is then decoded correctly.
    const decoder = new TextDecoder()

    const readLoop = async () => {
      try {
        while (true) {
          const { done, value } = await reader.read()
          if (done) break
          // A newer process (or stopBridge) has taken over: stop feeding
          // the shared buffer with output from this one.
          if (bridgeProc !== proc) break
          outputBuffer += decoder.decode(value, { stream: true })

          // Process complete lines
          let newlineIdx: number
          while ((newlineIdx = outputBuffer.indexOf('\n')) !== -1) {
            const line = outputBuffer.slice(0, newlineIdx).trim()
            outputBuffer = outputBuffer.slice(newlineIdx + 1)
            if (!line) continue
            try {
              const resp: BridgeResponse = JSON.parse(line)
              const pending = pendingRequests.get(resp.id)
              if (pending) {
                pendingRequests.delete(resp.id)
                if (resp.error) {
                  pending.reject(new Error(resp.error))
                } else {
                  pending.resolve(resp.result)
                }
              }
            } catch {}
          }
        }
      } catch {
      } finally {
        // Only clean up if this is still the active bridge process.
        if (bridgeProc === proc) {
          bridgeProc = null
          outputBuffer = ''
          const stranded = [...pendingRequests.values()]
          pendingRequests.clear()
          for (const pending of stranded) {
            pending.reject(new Error('Python bridge exited'))
          }
        }
      }
    }
    void readLoop()
    return true
  } catch {
    bridgeProc = null
    return false
  }
}
/**
 * Send one request to the Python bridge and resolve with its result.
 *
 * Rejects when the bridge cannot be started, when the bridge answers with
 * an error, when writing to the bridge's stdin fails, or when no answer
 * arrives within `timeoutMs` milliseconds.
 */
export async function call<T = unknown>(
  method: string,
  params: Record<string, unknown> = {},
  timeoutMs: number = 10000,
): Promise<T> {
  if (!ensureBridge()) {
    throw new Error('Python bridge not available')
  }

  requestId += 1
  const id = requestId
  const request: BridgeRequest = { id, method, params }

  return new Promise<T>((resolve, reject) => {
    // Timeout
    const timer = setTimeout(() => {
      pendingRequests.delete(id)
      reject(new Error(`Bridge call ${method} timed out after ${timeoutMs}ms`))
    }, timeoutMs)

    // Whichever way the request settles, the timeout is cancelled first.
    pendingRequests.set(id, {
      resolve: value => {
        clearTimeout(timer)
        ;(resolve as any)(value)
      },
      reject: error => {
        clearTimeout(timer)
        reject(error)
      },
    })

    try {
      const stdin = bridgeProc!.stdin
      stdin.write(JSON.stringify(request) + '\n')
      bridgeProc!.stdin.flush()
    } catch (err) {
      clearTimeout(timer)
      pendingRequests.delete(id)
      reject(new Error(`Bridge write failed: ${err}`))
    }
  })
}
/**
 * Synchronous call — blocks the event loop. Use sparingly.
 *
 * Does not use the long-lived bridge: a one-shot `python bridge.py` process
 * is spawned, the single request is fed to it on stdin, and its stdout is
 * parsed as the response.
 *
 * @returns the result, or null on any failure (spawn error, empty or
 *   unparsable output, or an error reported by the bridge).
 */
export function callSync<T = unknown>(
  method: string,
  params: Record<string, unknown> = {},
  timeoutMs: number = 10000,
): T | null {
  // SECURITY: JSON is passed via stdin (not embedded in -c) to prevent code injection.
  try {
    const scriptPath = path.join(__dirname, 'bridge.py')
    const requestLine = JSON.stringify({ id: 1, method, params }) + '\n'

    const proc = Bun.spawnSync({
      cmd: ['python', '-u', scriptPath],
      stdin: Buffer.from(requestLine),
      stdout: 'pipe',
      stderr: 'pipe',
      env: { ...process.env, PYTHONIOENCODING: 'utf-8' },
      timeout: timeoutMs,
    })

    const output = new TextDecoder().decode(proc.stdout).trim()
    if (output === '') {
      return null
    }

    const response: BridgeResponse = JSON.parse(output)
    if (response.error) {
      throw new Error(response.error)
    }
    return response.result as T
  } catch {
    return null
  }
}
/**
 * Kill the bridge process.
 *
 * Closes the process's stdin and kills it (each step is attempted even if
 * the other throws), resets the module state, and rejects every request
 * that was still waiting for an answer. Safe to call when no bridge is
 * running.
 */
export function stopBridge(): void {
  const proc = bridgeProc
  bridgeProc = null

  if (proc) {
    // Separate try blocks: a failure closing stdin must not skip kill().
    try {
      proc.stdin.end()
    } catch {}
    try {
      proc.kill()
    } catch {}
  }

  const stranded = [...pendingRequests.values()]
  pendingRequests.clear()
  outputBuffer = ''

  // Settle waiters now rather than leaving them to their timeouts.
  for (const pending of stranded) {
    pending.reject(new Error('Python bridge stopped'))
  }
}
// NOTE: No process exit handlers here — the platform-level win32.ts
// already registers exit/SIGINT/SIGTERM handlers that call cleanupAll(),
// which includes stopBridge(). Adding handlers here would cause double
// cleanup and duplicate process.exit() calls.

View File

@@ -0,0 +1,320 @@
/**
* Excel COM automation via PowerShell.
* Completely headless — Visible=false, no window, no user impact.
* Each operation opens and closes Excel to avoid orphaned processes.
*/
/** One non-empty cell: position, value and, if present, its formula. */
export interface CellInfo {
row: number
col: number
value: string | number | null
formula?: string
}
/** One worksheet: its name, used-range dimensions and the cells read from it. */
export interface SheetInfo {
name: string
usedRange: { rows: number; cols: number }
cells: CellInfo[]
}
/** Contents of a workbook as returned by openExcel. */
export interface ExcelInfo {
sheets: SheetInfo[]
sheetNames: string[]
}
/**
 * Run a PowerShell script synchronously and return its trimmed stdout.
 *
 * @throws Error when PowerShell exits with a non-zero code and wrote
 *   something to stderr.
 */
function ps(script: string): string {
  const { stdout, stderr, exitCode } = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    stdout: 'pipe',
    stderr: 'pipe',
  })

  const errorText = new TextDecoder().decode(stderr).trim()
  const failed = exitCode !== 0 && Boolean(errorText)
  if (failed) {
    throw new Error(`PowerShell error: ${errorText}`)
  }

  return new TextDecoder().decode(stdout).trim()
}
/**
 * Escape a string for embedding inside a PowerShell single-quoted literal.
 *
 * Every single-quote character is doubled. PowerShell accepts the
 * typographic quotes U+2018, U+2019, U+201A and U+201B as single-quote
 * delimiters as well, so those are doubled too; otherwise a path containing
 * one of them could terminate the literal early.
 */
function escPath(p: string): string {
  return p.replace(/['\u2018\u2019\u201A\u201B]/g, quote => quote + quote)
}
/**
 * Build the PowerShell statement that assigns a worksheet of `$wb` to the
 * variable `$<varName>`. A numeric `sheet` is used as an index; a string is
 * used as a sheet name inside a single-quoted literal with `'` doubled.
 */
function resolveSheet(varName: string, sheet: string | number): string {
  const selector =
    typeof sheet === 'number' ? `${sheet}` : `'${sheet.replace(/'/g, "''")}'`
  return `$${varName} = $wb.Sheets.Item(${selector})`
}
// PowerShell preamble shared by every operation: creates the Excel COM
// object as `$excel`, keeps it invisible and turns off its alert prompts.
const EXCEL_INIT = `
$excel = New-Object -ComObject Excel.Application
$excel.Visible = $false
$excel.DisplayAlerts = $false
`.trim()
/**
 * Build the PowerShell cleanup statements placed in each script's `finally`
 * block: optionally close `$wb` without saving, then quit Excel and release
 * the COM object.
 */
function excelCleanup(hasWorkbook = true): string {
  const closeWorkbook = hasWorkbook ? ['if ($wb) { $wb.Close($false) }'] : []
  const statements = [
    ...closeWorkbook,
    '$excel.Quit()',
    '[System.Runtime.InteropServices.Marshal]::ReleaseComObject($excel) | Out-Null',
  ]
  return statements.join('\n ')
}
/**
 * Open a workbook and read its contents.
 *
 * For every sheet the name, the used-range size and up to 1000 non-empty
 * cells (value plus formula, when the cell has one) are collected.
 *
 * @throws Error when the script produces no output (or PowerShell fails).
 */
export function openExcel(filePath: string): ExcelInfo {
  const quotedPath = escPath(filePath)
  const cleanup = excelCleanup()
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${quotedPath}')
$result = @{ sheets = @(); sheetNames = @() }
foreach ($sheet in $wb.Sheets) {
$result.sheetNames += $sheet.Name
$ur = $sheet.UsedRange
$rows = $ur.Rows.Count
$cols = $ur.Columns.Count
$cells = @()
$count = 0
for ($r = 1; $r -le $rows -and $count -lt 1000; $r++) {
for ($c = 1; $c -le $cols -and $count -lt 1000; $c++) {
$cell = $sheet.Cells.Item($r, $c)
$val = $cell.Value2
if ($null -ne $val) {
$f = $null
if ($cell.HasFormula) { $f = $cell.Formula }
$entry = @{ row = $r; col = $c; value = $val }
if ($f) { $entry.formula = $f }
$cells += $entry
$count++
}
}
}
$result.sheets += @{
name = $sheet.Name
usedRange = @{ rows = $rows; cols = $cols }
cells = $cells
}
}
$result | ConvertTo-Json -Depth 5 -Compress
} finally {
${cleanup}
}
`
  const output = ps(script)
  if (!output) throw new Error('No output from openExcel')
  const parsed = JSON.parse(output)

  // Normalize: PowerShell single-element arrays become objects
  const asArray = (v: any): any[] => (Array.isArray(v) ? v : [v])
  const rawSheets: SheetInfo[] = asArray(parsed.sheets)
  const sheetNames: string[] = asArray(parsed.sheetNames)

  const sheets = rawSheets.map((s: any) => ({
    name: s.name,
    usedRange: s.usedRange,
    cells: s.cells ? asArray(s.cells) : [],
  }))
  return { sheets, sheetNames }
}
/**
 * Read the value of one cell.
 *
 * @returns the parsed value, or null when the cell is empty or the script
 *   printed nothing.
 */
export function readCell(
  filePath: string,
  sheet: string | number,
  row: number,
  col: number,
): string | number | null {
  const quotedPath = escPath(filePath)
  const selectSheet = resolveSheet('sheet', sheet)
  const cleanup = excelCleanup()
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${quotedPath}')
${selectSheet}
$val = $sheet.Cells.Item(${row}, ${col}).Value2
if ($null -eq $val) { Write-Output 'null' } else { Write-Output ($val | ConvertTo-Json -Compress) }
} finally {
${cleanup}
}
`
  const output = ps(script)
  const isEmpty = output === 'null' || output === ''
  return isEmpty ? null : JSON.parse(output)
}
/**
 * Read a rectangular range (inclusive bounds) as a 2D array of rows.
 *
 * Empty cells travel through the script as the marker '__NULL__' and are
 * turned back into null here. Returns [] when the script prints nothing.
 */
export function readRange(
  filePath: string,
  sheet: string | number,
  startRow: number,
  startCol: number,
  endRow: number,
  endCol: number,
): (string | number | null)[][] {
  const quotedPath = escPath(filePath)
  const selectSheet = resolveSheet('sheet', sheet)
  const cleanup = excelCleanup()
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${quotedPath}')
${selectSheet}
$rows = @()
for ($r = ${startRow}; $r -le ${endRow}; $r++) {
$row = @()
for ($c = ${startCol}; $c -le ${endCol}; $c++) {
$val = $sheet.Cells.Item($r, $c).Value2
$row += if ($null -eq $val) { '__NULL__' } else { $val }
}
$rows += ,@($row)
}
$rows | ConvertTo-Json -Depth 3 -Compress
} finally {
${cleanup}
}
`
  const output = ps(script)
  if (!output) return []
  const parsed = JSON.parse(output)

  // Normalize single-row case
  const table: any[] = Array.isArray(parsed[0]) ? parsed : [parsed]
  const restoreNull = (cell: any) => (cell === '__NULL__' ? null : cell)
  return table.map((cells: any[]) => cells.map(restoreNull))
}
/**
 * Write one value into a cell and save the workbook.
 *
 * The value is passed to the script as JSON inside a single-quoted literal.
 *
 * @returns true when the script printed 'true'.
 */
export function writeCell(
  filePath: string,
  sheet: string | number,
  row: number,
  col: number,
  value: string | number,
): boolean {
  const quotedJson = JSON.stringify(value).replace(/'/g, "''")
  const quotedPath = escPath(filePath)
  const selectSheet = resolveSheet('sheet', sheet)
  const cleanup = excelCleanup()
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${quotedPath}')
${selectSheet}
$sheet.Cells.Item(${row}, ${col}).Value2 = (ConvertFrom-Json '${quotedJson}')
$wb.Save()
Write-Output 'true'
} finally {
${cleanup}
}
`
  const output = ps(script)
  return output === 'true'
}
/**
 * Write a 2D array of values with its top-left corner at
 * (startRow, startCol), then save the workbook.
 *
 * Null entries are skipped; numeric entries are written as doubles and
 * everything else as strings.
 *
 * @returns true when the script printed 'true'.
 */
export function writeRange(
  filePath: string,
  sheet: string | number,
  startRow: number,
  startCol: number,
  data: (string | number | null)[][],
): boolean {
  const quotedJson = JSON.stringify(data).replace(/'/g, "''")
  const quotedPath = escPath(filePath)
  const selectSheet = resolveSheet('sheet', sheet)
  const cleanup = excelCleanup()
  const script = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${quotedPath}')
${selectSheet}
$data = ConvertFrom-Json '${quotedJson}'
for ($r = 0; $r -lt $data.Count; $r++) {
$row = $data[$r]
for ($c = 0; $c -lt $row.Count; $c++) {
$val = $row[$c]
if ($null -ne $val) {
if ($val -is [int] -or $val -is [long] -or $val -is [double] -or $val -is [decimal]) {
$sheet.Cells.Item(${startRow} + $r, ${startCol} + $c).Value2 = [double]$val
} else {
$sheet.Cells.Item(${startRow} + $r, ${startCol} + $c).Value2 = [string]$val
}
}
}
}
$wb.Save()
Write-Output 'true'
} finally {
${cleanup}
}
`
  const output = ps(script)
  return output === 'true'
}
/**
 * Assign a formula string to the cell at (row, col) and save the workbook.
 *
 * Single quotes in the formula are doubled so it can be embedded in a
 * single-quoted PowerShell string.
 *
 * @returns true only when the script printed 'true'.
 */
export function setFormula(
  filePath: string,
  sheet: string | number,
  row: number,
  col: number,
  formula: string,
): boolean {
  const quotedFormula = formula.replace(/'/g, "''")
  const psScript = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${escPath(filePath)}')
${resolveSheet('sheet', sheet)}
$sheet.Cells.Item(${row}, ${col}).Formula = '${quotedFormula}'
$wb.Save()
Write-Output 'true'
} finally {
${excelCleanup()}
}
`
  const output = ps(psScript)
  return output === 'true'
}
/**
 * Open a workbook and persist it: SaveAs(savePath) when a target path is
 * supplied, otherwise Save() back to the original file.
 *
 * @returns true only when the script printed 'true'.
 */
export function saveExcel(filePath: string, savePath?: string): boolean {
  let persistCmd: string
  if (savePath) {
    persistCmd = `$wb.SaveAs('${escPath(savePath)}')`
  } else {
    persistCmd = '$wb.Save()'
  }
  const psScript = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Open('${escPath(filePath)}')
${persistCmd}
Write-Output 'true'
} finally {
${excelCleanup()}
}
`
  const output = ps(psScript)
  return output === 'true'
}
/**
 * Add a fresh workbook via Workbooks.Add() and write it to savePath.
 *
 * @returns true only when the script printed 'true'.
 */
export function createExcel(savePath: string): boolean {
  const psScript = `
${EXCEL_INIT}
try {
$wb = $excel.Workbooks.Add()
$wb.SaveAs('${escPath(savePath)}')
Write-Output 'true'
} finally {
${excelCleanup()}
}
`
  const output = ps(psScript)
  return output === 'true'
}
/**
 * Intentionally does nothing. Every operation in this module starts and
 * tears down its own Excel COM instance, so there is no session to close.
 */
export function closeExcel(_filePath: string): void {
  return
}

View File

@@ -0,0 +1,450 @@
/**
* Word COM automation module for Windows.
* Uses PowerShell to drive Word.Application COM object — fully headless (Visible=false).
* Each function builds a PowerShell script, runs it via Bun.spawnSync, and parses JSON output.
*/
// ---------------------------------------------------------------------------
// Types
// ---------------------------------------------------------------------------
/**
 * One paragraph as reported by openWord.
 * openWord fills `text` with the paragraph range text minus a trailing
 * carriage return, `bold`/`italic` with whether Font.Bold/Font.Italic equals -1,
 * and `fontSize` with Font.Size.
 */
export interface WordParagraph {
  text: string
  bold?: boolean
  italic?: boolean
  fontSize?: number
}
/**
 * One table as reported by openWord: its row/column counts and the cell
 * text as a row-major grid (cells that cannot be read become '').
 */
export interface WordTable {
  rows: number
  cols: number
  data: string[][]
}
/**
 * Snapshot returned by openWord: full document text, up to 500 paragraphs,
 * all tables, and the word/page counts from ComputeStatistics.
 */
export interface WordDocInfo {
  text: string
  paragraphs: WordParagraph[]
  tables: WordTable[]
  wordCount: number
  pageCount: number
}
/**
 * Optional font settings applied by appendText before the text is typed.
 * Properties left undefined are not touched.
 */
export interface AppendTextOptions {
  bold?: boolean
  italic?: boolean
  fontSize?: number
  fontName?: string
}
// ---------------------------------------------------------------------------
// PowerShell runner
// ---------------------------------------------------------------------------
/**
 * Execute a PowerShell script synchronously and hand back its stdout with
 * surrounding whitespace removed. stderr and the exit code are not inspected.
 */
function runPs(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({
    cmd: argv,
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
/**
 * Decode `raw` as JSON, returning `fallback` when `raw` is empty or is not
 * valid JSON. The parsed value is cast to T without validation.
 */
function parseJsonOutput<T>(raw: string, fallback: T): T {
  if (raw) {
    try {
      const decoded = JSON.parse(raw)
      return decoded as T
    } catch {
      // fall through to the fallback below
    }
  }
  return fallback
}
/**
 * Make a string safe to place between single quotes in PowerShell by
 * doubling every single quote it contains.
 */
function psEscape(s: string): string {
  const pieces = s.split("'")
  return pieces.join("''")
}
// ---------------------------------------------------------------------------
// Word COM wrapper template
// ---------------------------------------------------------------------------
/**
 * Surround a script body with the Word COM start-up and tear-down code.
 *
 * The generated script creates a hidden Word.Application with alerts off,
 * binds $doc (opened from `openPath`, or a new blank document when no path
 * is given), runs `body`, and in a finally block closes the document
 * without saving, quits Word and releases the COM object.
 */
function wrapWordScript(body: string, openPath?: string): string {
  let bindDoc: string
  if (openPath) {
    bindDoc = `$doc = $word.Documents.Open('${psEscape(openPath)}')`
  } else {
    bindDoc = '$doc = $word.Documents.Add()'
  }
  return `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
${bindDoc}
${body}
} finally {
if ($doc -ne $null) { $doc.Close($false); }
if ($word -ne $null) { $word.Quit(); }
if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
}
/**
 * Variant of the Word wrapper for mutating operations.
 *
 * The generated script opens `openPath`, runs `body`, calls $doc.Save()
 * itself (the body does not need to save) and prints {"ok":true}.
 * If anything throws, it prints {"ok":false,"error":"..."} instead.
 * The finally block closes the document, quits Word and releases the COM
 * object in both cases.
 */
function wrapWordScriptWithSave(body: string, openPath: string): string {
  const quotedPath = psEscape(openPath)
  return `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
$doc = $word.Documents.Open('${quotedPath}')
${body}
$doc.Save()
Write-Output '{"ok":true}'
} catch {
Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')
} finally {
if ($doc -ne $null) { $doc.Close($false); }
if ($word -ne $null) { $word.Quit(); }
if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
}
// ---------------------------------------------------------------------------
// 1. openWord
// ---------------------------------------------------------------------------
/**
 * Open a document headlessly and return a snapshot of its content.
 *
 * Collects up to 500 paragraphs (text, bold, italic, font size), every
 * table as a grid of cell text, the full document text, and the word and
 * page counts. When the script prints nothing or unparsable output, an
 * all-empty WordDocInfo is returned instead.
 */
export async function openWord(filePath: string): Promise<WordDocInfo> {
  const emptyInfo: WordDocInfo = {
    text: '',
    paragraphs: [],
    tables: [],
    wordCount: 0,
    pageCount: 0,
  }
  const extractBody = `
# Paragraphs (limit 500)
$paras = @()
$paraCount = $doc.Paragraphs.Count
$limit = [Math]::Min($paraCount, 500)
for ($i = 1; $i -le $limit; $i++) {
$p = $doc.Paragraphs.Item($i)
$r = $p.Range
$paras += @{
text = $r.Text -replace '\\r$',''
bold = [bool]($r.Font.Bold -eq -1)
italic = [bool]($r.Font.Italic -eq -1)
fontSize = $r.Font.Size
}
}
# Tables
$tables = @()
foreach ($table in $doc.Tables) {
$rows = $table.Rows.Count
$cols = $table.Columns.Count
$data = @()
for ($r = 1; $r -le $rows; $r++) {
$row = @()
for ($c = 1; $c -le $cols; $c++) {
try {
$cellText = $table.Cell($r, $c).Range.Text
# Trim trailing \\r\\a that Word adds to cell text
$cellText = $cellText -replace '[\\r\\n\\a]+$',''
$row += $cellText
} catch {
$row += ''
}
}
$data += ,@($row)
}
$tables += @{ rows = $rows; cols = $cols; data = $data }
}
# Counts: wdStatisticWords=0, wdStatisticPages=2
$wordCount = $doc.ComputeStatistics(0)
$pageCount = $doc.ComputeStatistics(2)
$result = @{
text = $doc.Content.Text
paragraphs = $paras
tables = $tables
wordCount = $wordCount
pageCount = $pageCount
}
Write-Output (ConvertTo-Json $result -Depth 5 -Compress)
`
  const output = runPs(wrapWordScript(extractBody, filePath))
  return parseJsonOutput<WordDocInfo>(output, emptyInfo)
}
// ---------------------------------------------------------------------------
// 2. readText
// ---------------------------------------------------------------------------
/**
 * Return the document's full text ($doc.Content.Text) as printed by
 * PowerShell, trimmed. Yields '' when the script produces no output.
 */
export async function readText(filePath: string): Promise<string> {
  const dumpBody = `Write-Output $doc.Content.Text`
  const output = runPs(wrapWordScript(dumpBody, filePath))
  return output
}
// ---------------------------------------------------------------------------
// 3. appendText
// ---------------------------------------------------------------------------
/**
 * Type `text` at the end of the document and save it.
 *
 * The selection is moved to the end of the story (EndKey(6)), any font
 * options that were supplied are applied to the selection, then the text
 * is typed.
 *
 * @returns true when the wrapper script reported {"ok":true}.
 */
export async function appendText(
  filePath: string,
  text: string,
  opts?: AppendTextOptions,
): Promise<boolean> {
  const fontLines: string[] = []
  if (opts) {
    if (opts.bold !== undefined) {
      fontLines.push(`$sel.Font.Bold = ${opts.bold ? '-1' : '0'}`)
    }
    if (opts.italic !== undefined) {
      fontLines.push(`$sel.Font.Italic = ${opts.italic ? '-1' : '0'}`)
    }
    if (opts.fontSize !== undefined) {
      fontLines.push(`$sel.Font.Size = ${opts.fontSize}`)
    }
    if (opts.fontName) {
      fontLines.push(`$sel.Font.Name = '${psEscape(opts.fontName)}'`)
    }
  }
  const fontSetup = fontLines.join('\n ')
  const typeBody = `
$sel = $word.Selection
$sel.EndKey(6) | Out-Null
${fontSetup}
$sel.TypeText('${psEscape(text)}')
`
  const output = runPs(wrapWordScriptWithSave(typeBody, filePath))
  const outcome = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 4. insertText
// ---------------------------------------------------------------------------
/**
 * Insert `text` in front of the paragraph at `paraIndex` (passed straight
 * to Paragraphs.Item) and save the document.
 *
 * @returns true when the wrapper script reported {"ok":true}.
 */
export async function insertText(
  filePath: string,
  paraIndex: number,
  text: string,
): Promise<boolean> {
  const quotedText = psEscape(text)
  const insertBody = `
$doc.Paragraphs.Item(${paraIndex}).Range.InsertBefore('${quotedText}')
`
  const output = runPs(wrapWordScriptWithSave(insertBody, filePath))
  const outcome = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 5. findReplace
// ---------------------------------------------------------------------------
/**
 * Find `find` in the document, replace it with `replace`, and save.
 *
 * @param filePath   Document to open.
 * @param find       Text to search for (case-insensitive, no wildcards).
 * @param replace    Replacement text.
 * @param replaceAll Replace every occurrence unless explicitly `false`,
 *                   in which case only the first match is replaced.
 * @returns Number of occurrences replaced (0 or 1 in replace-one mode);
 *          0 when the script fails or prints unparsable output.
 */
export async function findReplace(
  filePath: string,
  find: string,
  replace: string,
  replaceAll?: boolean,
): Promise<number> {
  // wdReplaceAll=2, wdReplaceOne=1
  const replaceConst = replaceAll !== false ? 2 : 1
  const findEsc = psEscape(find)
  const replaceEsc = psEscape(replace)
  const body = `
$content = $doc.Content
$findObj = $content.Find
$findObj.ClearFormatting()
$findObj.Replacement.ClearFormatting()
# Count replacements by iterating
$count = 0
$findObj.Text = '${findEsc}'
$findObj.Replacement.Text = '${replaceEsc}'
$findObj.Forward = $true
$findObj.Wrap = 0
$findObj.Format = $false
$findObj.MatchCase = $false
$findObj.MatchWholeWord = $false
$findObj.MatchWildcards = $false
if (${replaceConst} -eq 2) {
# Count occurrences first using a clone of content.
# Options are passed explicitly (same as the replace call, Wrap=wdFindStop)
# so the count matches what is replaced and the loop ends at document end.
$range2 = $doc.Content.Duplicate
while ($range2.Find.Execute('${findEsc}', $false, $false, $false, $false, $false, $true, 0)) { $count++ }
# Now do the actual replace. Execute returns a Boolean; discard it so it
# does not land on stdout ahead of the JSON result.
$findObj.Execute('${findEsc}', $false, $false, $false, $false, $false, $true, 0, $false, '${replaceEsc}', 2) | Out-Null
} else {
$found = $findObj.Execute('${findEsc}', $false, $false, $false, $false, $false, $true, 0, $false, '${replaceEsc}', 1)
if ($found) { $count = 1 }
}
`
  const script = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
$doc = $word.Documents.Open('${psEscape(filePath)}')
${body}
$doc.Save()
Write-Output ('{"count":' + $count + '}')
} catch {
Write-Output '{"count":0}'
} finally {
if ($doc -ne $null) { $doc.Close($false); }
if ($word -ne $null) { $word.Quit(); }
if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
  const raw = runPs(script)
  return parseJsonOutput<{ count: number }>(raw, { count: 0 }).count
}
// ---------------------------------------------------------------------------
// 6. insertTable
// ---------------------------------------------------------------------------
/**
 * Append a rows x cols table at the end of the document, fill it from
 * `data` (row-major), and save.
 *
 * `data` is rendered as a PowerShell array-of-arrays literal; each cell is
 * single-quote escaped.
 *
 * @returns true when the wrapper script reported {"ok":true}.
 */
export async function insertTable(
  filePath: string,
  rows: number,
  cols: number,
  data: string[][],
): Promise<boolean> {
  // One ",@('a','b',...)" entry per row; the leading comma keeps each row
  // as a nested array inside the surrounding @( ... ).
  const rowLiterals: string[] = []
  for (const row of data) {
    const cells = row.map((cell) => `'${psEscape(cell)}'`)
    rowLiterals.push(',@(' + cells.join(',') + ')')
  }
  const psData = rowLiterals.join('\n ')
  const tableBody = `
$sel = $word.Selection
$sel.EndKey(6) | Out-Null
$table = $doc.Tables.Add($sel.Range, ${rows}, ${cols})
$data = @(${psData})
for ($r = 0; $r -lt $data.Count; $r++) {
for ($c = 0; $c -lt $data[$r].Count; $c++) {
$table.Cell($r + 1, $c + 1).Range.Text = $data[$r][$c]
}
}
`
  const output = runPs(wrapWordScriptWithSave(tableBody, filePath))
  const outcome = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 7. saveWord
// ---------------------------------------------------------------------------
/**
 * Open a document and save it, either in place (no `savePath`, or
 * `savePath` equal to `filePath`) or to a different path via SaveAs.
 *
 * @returns true when the script reported {"ok":true}.
 */
export async function saveWord(
  filePath: string,
  savePath?: string,
): Promise<boolean> {
  let script: string
  if (savePath && savePath !== filePath) {
    const saveAsCmd = `$doc.SaveAs('${psEscape(savePath)}')`
    script = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
$doc = $word.Documents.Open('${psEscape(filePath)}')
${saveAsCmd}
Write-Output '{"ok":true}'
} catch {
Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')
} finally {
if ($doc -ne $null) { $doc.Close($false); }
if ($word -ne $null) { $word.Quit(); }
if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
  } else {
    // Plain save: the wrapper already calls $doc.Save() after an empty body.
    script = wrapWordScriptWithSave('', filePath)
  }
  const output = runPs(script)
  const outcome = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 8. saveAsPdf
// ---------------------------------------------------------------------------
/**
 * Export a document to PDF by calling SaveAs2 with format code 17
 * (wdFormatPDF).
 *
 * @returns true when the script reported {"ok":true}.
 */
export async function saveAsPdf(
  filePath: string,
  pdfPath: string,
): Promise<boolean> {
  // wdFormatPDF = 17
  const exportCmd = `$doc.SaveAs2('${psEscape(pdfPath)}', 17)`
  const quotedSource = psEscape(filePath)
  const script = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
$doc = $word.Documents.Open('${quotedSource}')
${exportCmd}
Write-Output '{"ok":true}'
} catch {
Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')
} finally {
if ($doc -ne $null) { $doc.Close($false); }
if ($word -ne $null) { $word.Quit(); }
if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
  const output = runPs(script)
  const outcome = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 9. createWord
// ---------------------------------------------------------------------------
/**
 * Create a blank document with Documents.Add() and write it to `savePath`.
 *
 * @returns true when the script reported {"ok":true}.
 */
export async function createWord(savePath: string): Promise<boolean> {
  const quotedTarget = psEscape(savePath)
  const script = `
$word = New-Object -ComObject Word.Application
$word.Visible = $false
$word.DisplayAlerts = 0
try {
$doc = $word.Documents.Add()
$doc.SaveAs('${quotedTarget}')
Write-Output '{"ok":true}'
} catch {
Write-Output ('{"ok":false,"error":"' + ($_.Exception.Message -replace '"','\\"') + '"}')
} finally {
if ($doc -ne $null) { $doc.Close($false); }
if ($word -ne $null) { $word.Quit(); }
if ($word -ne $null) { [System.Runtime.InteropServices.Marshal]::ReleaseComObject($word) | Out-Null }
}
`
  const output = runPs(script)
  const outcome = parseJsonOutput<{ ok: boolean }>(output, { ok: false })
  return outcome.ok
}
// ---------------------------------------------------------------------------
// 10. closeWord (no-op)
// ---------------------------------------------------------------------------
/**
 * Intentionally does nothing. Every operation in this module starts and
 * tears down its own Word COM instance, so there is no session to close.
 */
export function closeWord(_filePath: string): void {
  return
}

View File

@@ -0,0 +1,254 @@
/**
* Input Indicator — floating label showing what Computer Use is doing
* on the bound window.
*
* Displays a small overlay near the bottom of the bound window:
* ⌨ Typing "hello world..."
* 🖱 Click (120, 50)
* ⌨ Ctrl+S
* 📜 Scroll ↓ 3
* ✅ Done
*
* Auto-fades after 2 seconds of inactivity.
* Click-through, TOPMOST, no taskbar icon.
*/
import * as fs from 'fs'
import * as path from 'path'
import { validateHwnd, getTmpDir } from './shared.js'
// Overlay size in pixels; interpolated into the PowerShell form script.
const INDICATOR_WIDTH = 350
const INDICATOR_HEIGHT = 28
// Milliseconds without a new message before the overlay is hidden.
const FADE_AFTER_MS = 2000
// "R, G, B" triples spliced into Color.FromArgb(...) calls in the script.
const BG_COLOR = '30, 30, 30' // dark background
const TEXT_COLOR = '220, 220, 220' // light text
// NOTE(review): not referenced in the visible part of this file — confirm it is used.
const ACCENT_COLOR = '80, 200, 80' // green accent for active
// State of the currently running indicator; all null when none is running.
let indicatorProc: ReturnType<typeof Bun.spawn> | null = null
// Creating this file tells the PowerShell process to exit.
let stopFile: string | null = null
// The generated .ps1 file that the process was started from.
let scriptFile: string | null = null
// Message hand-off file (stopFile + '.msg'), polled by the PowerShell timer.
let msgFile: string | null = null
/**
 * Build the PowerShell script that renders the floating indicator.
 *
 * The script creates a borderless, top-most, click-through WinForms window
 * with a single label and polls every 50 ms:
 *   - exits when the target window no longer exists or the stop file appears;
 *   - reads and deletes `<stop file>.msg` to pick up a new message;
 *   - while the last message is younger than FADE_AFTER_MS, positions the
 *     form at the bottom-centre of the target window and lowers the opacity
 *     during the last 30% of that period; afterwards hides the form.
 *
 * @param hwnd Target window handle as a decimal string, interpolated into
 *             the script as-is.
 * @param sf   Path of the stop file; the message file is `sf + '.msg'`.
 * @returns The complete script text.
 */
function buildIndicatorScript(hwnd: string, sf: string): string {
  // The path goes into a single-quoted PowerShell string, where backslashes
  // are literal and only a single quote needs escaping (by doubling it).
  // Doubling backslashes would corrupt UNC paths.
  const sfEsc = sf.replace(/'/g, "''")
  return `
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class Indicator {
[DllImport("user32.dll")] public static extern bool IsWindow(IntPtr h);
[DllImport("user32.dll",SetLastError=true)] public static extern int SetWindowLong(IntPtr h, int i, int v);
[DllImport("user32.dll",SetLastError=true)] public static extern int GetWindowLong(IntPtr h, int i);
[DllImport("user32.dll")] public static extern bool SetWindowPos(IntPtr h, IntPtr a, int x, int y, int w, int h2, uint f);
[DllImport("user32.dll")] public static extern bool GetWindowRect(IntPtr h, out RECT r);
[StructLayout(LayoutKind.Sequential)] public struct RECT { public int L,T,R,B; }
public const int GWL_EXSTYLE = -20;
public const int WS_EX_LAYERED = 0x80000;
public const int WS_EX_TRANSPARENT = 0x20;
public const int WS_EX_TOOLWINDOW = 0x80;
public const int WS_EX_NOACTIVATE = 0x08000000;
public static readonly IntPtr HWND_TOPMOST = new IntPtr(-1);
public const uint SWP_NOACTIVATE = 0x0010;
public const uint SWP_SHOWWINDOW = 0x0040;
public static void MakeOverlay(IntPtr h) {
int ex = GetWindowLong(h, GWL_EXSTYLE);
ex |= WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOOLWINDOW | WS_EX_NOACTIVATE;
SetWindowLong(h, GWL_EXSTYLE, ex);
}
}
'@
$targetHwnd = [IntPtr]::new([long]${hwnd})
$stopFile = '${sfEsc}'
$msgFile = $stopFile + '.msg'
$form = New-Object System.Windows.Forms.Form
$form.FormBorderStyle = [System.Windows.Forms.FormBorderStyle]::None
$form.ShowInTaskbar = $false
$form.TopMost = $true
$form.StartPosition = [System.Windows.Forms.FormStartPosition]::Manual
$form.Size = New-Object System.Drawing.Size(${INDICATOR_WIDTH}, ${INDICATOR_HEIGHT})
$form.Location = New-Object System.Drawing.Point(-32000, -32000)
# A Form rejects a BackColor whose alpha is below 255, so use an opaque
# colour; translucency comes from Opacity below.
$form.BackColor = [System.Drawing.Color]::FromArgb(${BG_COLOR})
$form.Opacity = 0.92
$label = New-Object System.Windows.Forms.Label
$label.Dock = [System.Windows.Forms.DockStyle]::Fill
$label.ForeColor = [System.Drawing.Color]::FromArgb(${TEXT_COLOR})
$label.Font = New-Object System.Drawing.Font("Segoe UI", 10, [System.Drawing.FontStyle]::Regular)
$label.TextAlign = [System.Drawing.ContentAlignment]::MiddleLeft
$label.Padding = New-Object System.Windows.Forms.Padding(8, 0, 8, 0)
$label.Text = ""
$form.Controls.Add($label)
$form.Show()
[Indicator]::MakeOverlay($form.Handle)
$script:lastMsg = ""
$script:lastMsgTime = [DateTime]::MinValue
$script:visible = $false
$timer = New-Object System.Windows.Forms.Timer
$timer.Interval = 50 # 20fps
$timer.Add_Tick({
if (-not [Indicator]::IsWindow($targetHwnd)) {
$timer.Stop(); $form.Close()
[System.Windows.Forms.Application]::ExitThread()
return
}
if (Test-Path $stopFile) {
$timer.Stop(); $form.Close()
try { Remove-Item $stopFile -ErrorAction SilentlyContinue } catch {}
try { Remove-Item $msgFile -ErrorAction SilentlyContinue } catch {}
[System.Windows.Forms.Application]::ExitThread()
return
}
# Read new message
if (Test-Path $msgFile) {
try {
$msg = Get-Content $msgFile -Raw -Encoding UTF8 -ErrorAction SilentlyContinue
if ($msg) {
$script:lastMsg = $msg.Trim()
$script:lastMsgTime = [DateTime]::Now
Remove-Item $msgFile -ErrorAction SilentlyContinue
}
} catch {}
}
# Fade logic: hide after ${FADE_AFTER_MS}ms of no updates
$elapsed = ([DateTime]::Now - $script:lastMsgTime).TotalMilliseconds
if ($elapsed -gt ${FADE_AFTER_MS} -and $script:visible) {
$form.Visible = $false
$script:visible = $false
return
}
if ($elapsed -le ${FADE_AFTER_MS} -and $script:lastMsg -ne "") {
# Position at bottom-center of the bound window
$wr = New-Object Indicator+RECT
[Indicator]::GetWindowRect($targetHwnd, [ref]$wr) | Out-Null
$ww = $wr.R - $wr.L
$fx = $wr.L + [int](($ww - ${INDICATOR_WIDTH}) / 2)
$fy = $wr.B - ${INDICATOR_HEIGHT} - 8
$label.Text = $script:lastMsg
[Indicator]::SetWindowPos($form.Handle, [Indicator]::HWND_TOPMOST,
$fx, $fy, 0, 0,
0x0001 -bor [Indicator]::SWP_NOACTIVATE -bor [Indicator]::SWP_SHOWWINDOW) | Out-Null
$form.Visible = $true
$script:visible = $true
# Fade opacity near end
if ($elapsed -gt ${FADE_AFTER_MS * 0.7}) {
$form.Opacity = [Math]::Max(0.3, 0.92 * (1.0 - ($elapsed - ${FADE_AFTER_MS * 0.7}) / ${FADE_AFTER_MS * 0.3}))
} else {
$form.Opacity = 0.92
}
}
})
$timer.Start()
[System.Windows.Forms.Application]::Run()
`
}
/**
 * Start the input indicator for a bound window.
 *
 * Stops any indicator that is already running, writes the generated
 * script to a timestamped .ps1 in the temp directory, and launches it with
 * PowerShell in the background.
 *
 * @param hwnd Window handle as a decimal string.
 * @returns true when the process was spawned, false when writing the
 *          script or spawning failed.
 * @throws Error when `hwnd` is rejected by validateHwnd.
 */
export function showIndicator(hwnd: string): boolean {
  hwnd = validateHwnd(hwnd)
  hideIndicator()
  try {
    const tmpDir = getTmpDir()
    const ts = Date.now()
    stopFile = path.join(tmpDir, `cu_indicator_stop_${ts}`)
    scriptFile = path.join(tmpDir, `cu_indicator_${ts}.ps1`)
    msgFile = stopFile + '.msg'
    // Windows PowerShell decodes a BOM-less .ps1 with the system ANSI code
    // page, which would mangle a non-ASCII temp path embedded in the script.
    // A leading U+FEFF makes it read the file as UTF-8.
    const scriptText = '\uFEFF' + buildIndicatorScript(hwnd, stopFile)
    fs.writeFileSync(scriptFile, scriptText, 'utf-8')
    indicatorProc = Bun.spawn(
      [
        'powershell',
        '-NoProfile',
        '-ExecutionPolicy',
        'Bypass',
        '-File',
        scriptFile,
      ],
      { stdout: 'ignore', stderr: 'ignore' },
    )
    return true
  } catch {
    return false
  }
}
/**
 * Publish a new message for the indicator by writing it to the message
 * file. Does nothing when no indicator is active; write failures are
 * ignored.
 */
export function updateIndicator(message: string): void {
  const target = msgFile
  if (target) {
    try {
      fs.writeFileSync(target, message, 'utf-8')
    } catch {}
  }
}
/**
 * Hide and destroy the indicator.
 *
 * Creates the stop file so the PowerShell process exits on its next timer
 * tick, then after 2 seconds kills the process if it is still around and
 * removes the script, stop and message files. All steps are best-effort.
 * Safe to call when no indicator is running.
 */
export function hideIndicator(): void {
  // Snapshot the state of the indicator being stopped BEFORE clearing the
  // module variables. The delayed cleanup must act on this instance, not on
  // whatever showIndicator() may have started in the meantime.
  const proc = indicatorProc
  const stop = stopFile
  const script = scriptFile
  const msg = msgFile
  indicatorProc = null
  stopFile = null
  scriptFile = null
  msgFile = null
  if (!stop) return
  try {
    fs.writeFileSync(stop, 'STOP', 'utf-8')
  } catch {}
  setTimeout(() => {
    try {
      proc?.kill()
    } catch {}
    for (const leftover of [script, stop, msg]) {
      if (!leftover) continue
      try {
        fs.unlinkSync(leftover)
      } catch {}
    }
  }, 2000)
}
// ── Convenience methods for common actions ──
/** Show a "Typing" message; text longer than 30 characters is cut and suffixed with "...". */
export function indicateTyping(text: string): void {
  let preview = text
  if (text.length > 30) {
    preview = text.slice(0, 30) + '...'
  }
  updateIndicator(`\u2328 Typing "${preview}"`)
}
/** Show a key or key-combination message, prefixed with the keyboard symbol. */
export function indicateKey(combo: string): void {
  const message = `\u2328 ${combo}`
  updateIndicator(message)
}
/** Show a click message with its coordinates; 'right' is labelled "Right-click", anything else "Click". */
export function indicateClick(
  x: number,
  y: number,
  button: string = 'left',
): void {
  const verb = button === 'right' ? 'Right-click' : 'Click'
  updateIndicator(`\uD83D\uDDB1 ${verb} (${x}, ${y})`)
}
/** Show a scroll message with an arrow for the direction; unknown directions use the right arrow. */
export function indicateScroll(direction: string, amount: number): void {
  let arrow: string
  switch (direction) {
    case 'up':
      arrow = '\u2191'
      break
    case 'down':
      arrow = '\u2193'
      break
    case 'left':
      arrow = '\u2190'
      break
    default:
      arrow = '\u2192'
  }
  updateIndicator(`\uD83D\uDCDC Scroll ${arrow} ${amount}`)
}
/** Show the "Done" message. */
export function indicateDone(): void {
  const message = '\u2705 Done'
  updateIndicator(message)
}

View File

@@ -3,6 +3,8 @@
* Captures a screen region or window, then runs WinRT OCR to extract text.
*/
import { ps as runPs } from './shared.js'
export interface OcrLine {
text: string
bounds: { x: number; y: number; w: number; h: number }
@@ -18,15 +20,6 @@ function emptyResult(language: string): OcrResult {
return { text: '', lines: [], language }
}
function runPs(script: string): string {
const result = Bun.spawnSync({
cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
stdout: 'pipe',
stderr: 'pipe',
})
return new TextDecoder().decode(result.stdout).trim()
}
/**
* PowerShell script that:
* 1. Screenshots a screen region using CopyFromScreen

View File

@@ -0,0 +1,127 @@
/**
* Shared utilities for win32 Computer Use modules.
* Single source of truth — no more duplication across files.
*/
/**
 * Accept a window handle only when it consists solely of decimal digits,
 * so it can be interpolated into PowerShell/Python source safely.
 *
 * @returns The same string, unchanged.
 * @throws Error when the string is empty or contains any non-digit.
 */
export function validateHwnd(hwnd: string): string {
  const digitsOnly = /^\d+$/.test(hwnd)
  if (digitsOnly) return hwnd
  throw new Error(`Invalid HWND: "${hwnd}" — must be numeric`)
}
/**
 * Execute a PowerShell script synchronously and return its trimmed stdout.
 * The exit code and stderr are not inspected.
 */
export function ps(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({
    cmd: argv,
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
/**
 * Execute a PowerShell script synchronously.
 *
 * @returns Trimmed stdout on exit code 0; null when the exit code is
 *          non-zero or spawning/decoding throws.
 */
export function runPs(script: string): string | null {
  try {
    const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
    const child = Bun.spawnSync({ cmd: argv, stdout: 'pipe', stderr: 'pipe' })
    return child.exitCode === 0
      ? new TextDecoder().decode(child.stdout).trim()
      : null
  } catch {
    return null
  }
}
/**
 * Run a PowerShell script asynchronously.
 *
 * @returns Trimmed stdout once the process has exited. The exit code is
 *          not inspected and stderr content is discarded.
 */
export async function psAsync(script: string): Promise<string> {
  const proc = Bun.spawn(
    ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
    { stdout: 'pipe', stderr: 'pipe' },
  )
  // Drain stderr alongside stdout. A piped stream that nobody reads can
  // fill up and block the child, in which case stdout never reaches EOF.
  const [out] = await Promise.all([
    new Response(proc.stdout).text(),
    new Response(proc.stderr).text(),
  ])
  await proc.exited
  return out.trim()
}
/**
 * Resolve the temp directory from the environment: TEMP first, then TMP,
 * and '/tmp' when neither is set to a non-empty value.
 */
export function getTmpDir(): string {
  const { TEMP, TMP } = process.env
  if (TEMP) return TEMP
  if (TMP) return TMP
  return '/tmp'
}
/**
 * Lower-case key name -> Windows virtual-key code.
 * Several names are aliases for the same code (e.g. enter/return,
 * ctrl/control, alt/option/menu, win/meta/command/cmd/super).
 */
export const VK_MAP: Record<string, number> = {
  // Editing and whitespace
  backspace: 0x08,
  tab: 0x09,
  enter: 0x0D,
  return: 0x0D,

  // Shift
  shift: 0x10,
  lshift: 0xA0,
  rshift: 0xA1,

  // Control
  ctrl: 0x11,
  control: 0x11,
  lcontrol: 0xA2,
  rcontrol: 0xA3,

  // Alt
  alt: 0x12,
  option: 0x12,
  menu: 0x12,
  lalt: 0xA4,
  ralt: 0xA5,

  // Miscellaneous
  pause: 0x13,
  capslock: 0x14,
  escape: 0x1B,
  esc: 0x1B,
  space: 0x20,

  // Navigation
  pageup: 0x21,
  pagedown: 0x22,
  end: 0x23,
  home: 0x24,
  left: 0x25,
  up: 0x26,
  right: 0x27,
  down: 0x28,
  insert: 0x2D,
  delete: 0x2E,

  // Windows key and its aliases
  win: 0x5B,
  meta: 0x5B,
  command: 0x5B,
  cmd: 0x5B,
  super: 0x5B,

  // Locks and print screen
  numlock: 0x90,
  scrolllock: 0x91,
  printscreen: 0x2C,

  // Function keys: f1..f12 occupy the consecutive codes 0x70..0x7B
  ...Object.fromEntries(
    Array.from({ length: 12 }, (_, i) => [`f${i + 1}`, 0x70 + i] as [string, number]),
  ),
}
/**
 * Key names that act as modifiers (Shift, Control, Alt and Windows-key
 * families). Names match the lower-case keys of VK_MAP.
 */
export const MODIFIER_KEYS = new Set([
  'shift',
  'lshift',
  'rshift',
  'control',
  'ctrl',
  'lcontrol',
  'rcontrol',
  'alt',
  'option',
  // VK_MAP maps 'menu' to the same code as 'alt' (0x12), so it must be
  // classified the same way.
  'menu',
  'lalt',
  'ralt',
  'win',
  'meta',
  'command',
  'cmd',
  'super',
])

View File

@@ -5,6 +5,8 @@
* value setting, and hit-testing via PowerShell + System.Windows.Automation.
*/
import { ps } from './shared.js'
export interface UIElement {
name: string
controlType: string // Button, Edit, Text, List, Window, etc.
@@ -15,6 +17,48 @@ export interface UIElement {
children?: UIElement[]
}
// Allow-list of control type names accepted in element queries. A name is
// spliced into a `[System.Windows.Automation.ControlType]::<name>` expression,
// so anything outside this list is rejected first.
const VALID_CONTROL_TYPES = new Set(
  [
    'Button Calendar CheckBox ComboBox Custom',
    'DataGrid DataItem Document Edit Group',
    'Header HeaderItem Hyperlink Image',
    'List ListItem Menu MenuBar MenuItem',
    'Pane ProgressBar RadioButton ScrollBar Separator',
    'Slider Spinner SplitButton StatusBar',
    'Tab TabItem Table Text Thumb TitleBar',
    'ToolBar ToolTip Tree TreeItem Window',
  ]
    .join(' ')
    .split(' '),
)
// ---------------------------------------------------------------------------
// Helper
// ---------------------------------------------------------------------------
@@ -25,15 +69,6 @@ Add-Type -AssemblyName UIAutomationTypes
Add-Type -AssemblyName WindowsBase
`
function ps(script: string): string {
const result = Bun.spawnSync({
cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', script],
stdout: 'pipe',
stderr: 'pipe',
})
return new TextDecoder().decode(result.stdout).trim()
}
function parseJsonSafe<T>(raw: string, fallback: T): T {
try {
if (!raw) return fallback
@@ -143,6 +178,9 @@ export function findElement(
)
}
if (query.controlType) {
if (!VALID_CONTROL_TYPES.has(query.controlType)) {
return null // Invalid control type
}
const v = query.controlType.replace(/'/g, "''")
conditions.push(
`[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::ControlTypeProperty, [System.Windows.Automation.ControlType]::${v})`,
@@ -204,7 +242,10 @@ $obj | ConvertTo-Json -Compress
/**
* Click an element by its automationId using InvokePattern.
*/
export function clickElement(windowTitle: string, automationId: string): boolean {
export function clickElement(
windowTitle: string,
automationId: string,
): boolean {
const escapedTitle = windowTitle.replace(/'/g, "''")
const escapedId = automationId.replace(/'/g, "''")
@@ -237,7 +278,11 @@ try {
/**
* Set the value of an element by its automationId using ValuePattern.
*/
export function setValue(windowTitle: string, automationId: string, value: string): boolean {
export function setValue(
windowTitle: string,
automationId: string,
value: string,
): boolean {
const escapedTitle = windowTitle.replace(/'/g, "''")
const escapedId = automationId.replace(/'/g, "''")
const escapedValue = value.replace(/'/g, "''")

View File

@@ -0,0 +1,268 @@
/**
* Virtual Cursor — visible overlay cursor for the bound window.
*
* Shows a small colored cursor icon on top of the bound window,
* independent of the real mouse cursor. The user's real mouse
* stays free for their own use.
*
* The virtual cursor:
* - Moves when Computer Use calls click/moveMouse
* - Shows click animations (brief color flash)
* - Is click-through (WS_EX_TRANSPARENT) — doesn't intercept real mouse
* - Tracks the bound window position via the border tracker
* - Disappears when the window is unbound
*/
import * as fs from 'fs'
import * as path from 'path'
import { validateHwnd, getTmpDir } from './shared.js'
// Edge length in pixels of the square overlay form and of the bitmap the arrow is drawn on.
const CURSOR_SIZE = 20
// RGB components of the arrow fill colour (passed to Color.FromArgb in the script).
const CURSOR_COLOR_R = 255
const CURSOR_COLOR_G = 50
const CURSOR_COLOR_B = 50
// Opacity assigned to the overlay form.
const CURSOR_OPACITY = 0.9
// State of the running cursor overlay.
// NOTE(review): the code that assigns these is outside this view — presumably null when no overlay is running.
let cursorProc: ReturnType<typeof Bun.spawn> | null = null
let cursorStopFile: string | null = null
let cursorScriptFile: string | null = null
/**
 * Build the PowerShell script that hosts the virtual cursor overlay.
 *
 * The generated script:
 *  - creates a borderless, top-most WinForms form showing an arrow bitmap and
 *    turns it into a click-through overlay via extended window styles;
 *  - runs a 16 ms timer that exits when the target window no longer exists or
 *    when `stopFile` appears (removing the stop and position files on the way);
 *  - reads `<stopFile>.pos` containing "x,y" or "x,y,click", deletes it, and
 *    repositions the overlay at the target window's rect origin plus (x, y).
 *
 * NOTE(review): the script adds the coordinates to GetWindowRect's left/top,
 * i.e. the outer window origin, while its own comment talks about client
 * coordinates — confirm which origin callers actually use.
 *
 * @param hwnd     Target window handle; interpolated verbatim, so it must
 *                 already be validated by the caller.
 * @param stopFile Absolute path of the stop-signal file.
 * @returns The complete script text.
 */
function buildCursorScript(hwnd: string, stopFile: string): string {
  // The path lands inside a single-quoted PowerShell literal. Backslashes are
  // literal there (doubling them corrupts UNC paths), whereas quote characters
  // — including the typographic ones PowerShell treats as single quotes —
  // terminate the string and must be doubled.
  const stopFileLiteral = stopFile.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  return `
Add-Type -AssemblyName System.Windows.Forms
Add-Type -AssemblyName System.Drawing
Add-Type @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Drawing2D;
public class VCursor {
[DllImport("user32.dll")]
public static extern bool IsWindow(IntPtr hWnd);
[DllImport("user32.dll", SetLastError = true)]
public static extern int SetWindowLong(IntPtr hWnd, int nIndex, int dwNewLong);
[DllImport("user32.dll", SetLastError = true)]
public static extern int GetWindowLong(IntPtr hWnd, int nIndex);
[DllImport("user32.dll")]
public static extern bool SetWindowPos(IntPtr hWnd, IntPtr hAfter, int X, int Y, int cx, int cy, uint f);
[DllImport("user32.dll")]
public static extern bool GetWindowRect(IntPtr h, out RECT r);
[StructLayout(LayoutKind.Sequential)]
public struct RECT { public int L, T, R, B; }
public const int GWL_EXSTYLE = -20;
public const int WS_EX_LAYERED = 0x80000;
public const int WS_EX_TRANSPARENT = 0x20;
public const int WS_EX_TOOLWINDOW = 0x80;
public const int WS_EX_NOACTIVATE = 0x08000000;
public static readonly IntPtr HWND_TOPMOST = new IntPtr(-1);
public const uint SWP_NOACTIVATE = 0x0010;
public const uint SWP_SHOWWINDOW = 0x0040;
public const uint SWP_NOSIZE = 0x0001;
public static void MakeOverlay(IntPtr h) {
int ex = GetWindowLong(h, GWL_EXSTYLE);
ex |= WS_EX_LAYERED | WS_EX_TRANSPARENT | WS_EX_TOOLWINDOW | WS_EX_NOACTIVATE;
SetWindowLong(h, GWL_EXSTYLE, ex);
}
}
'@
$targetHwnd = [IntPtr]::new([long]${hwnd})
$stopFile = '${stopFileLiteral}'
$cursorSize = ${CURSOR_SIZE}
# Create cursor form with arrow shape
$cursor = New-Object System.Windows.Forms.Form
$cursor.FormBorderStyle = [System.Windows.Forms.FormBorderStyle]::None
$cursor.ShowInTaskbar = $false
$cursor.TopMost = $true
$cursor.StartPosition = [System.Windows.Forms.FormStartPosition]::Manual
$cursor.Size = New-Object System.Drawing.Size($cursorSize, $cursorSize)
$cursor.Location = New-Object System.Drawing.Point(-32000, -32000)
$cursor.Opacity = ${CURSOR_OPACITY}
$cursor.BackColor = [System.Drawing.Color]::Magenta
$cursor.TransparencyKey = [System.Drawing.Color]::Magenta
# Draw arrow cursor shape
$bmp = New-Object System.Drawing.Bitmap($cursorSize, $cursorSize)
$g = [System.Drawing.Graphics]::FromImage($bmp)
$g.SmoothingMode = [System.Drawing.Drawing2D.SmoothingMode]::AntiAlias
# Arrow polygon (pointing top-left)
$points = @(
(New-Object System.Drawing.Point(1, 1)),
(New-Object System.Drawing.Point(1, 16)),
(New-Object System.Drawing.Point(5, 12)),
(New-Object System.Drawing.Point(9, 18)),
(New-Object System.Drawing.Point(12, 16)),
(New-Object System.Drawing.Point(8, 10)),
(New-Object System.Drawing.Point(13, 10)),
(New-Object System.Drawing.Point(1, 1))
)
$brush = New-Object System.Drawing.SolidBrush([System.Drawing.Color]::FromArgb(${CURSOR_COLOR_R}, ${CURSOR_COLOR_G}, ${CURSOR_COLOR_B}))
$g.FillPolygon($brush, $points)
$pen = New-Object System.Drawing.Pen([System.Drawing.Color]::White, 1)
$g.DrawPolygon($pen, $points)
$g.Dispose()
$cursor.BackgroundImage = $bmp
$cursor.Show()
[VCursor]::MakeOverlay($cursor.Handle)
# Position file: the TS side writes "x,y" or "x,y,click" to this file
$posFile = $stopFile + '.pos'
$script:lastCX = -32000
$script:lastCY = -32000
$script:clickFlash = 0
$timer = New-Object System.Windows.Forms.Timer
$timer.Interval = 16 # ~60fps
$timer.Add_Tick({
if (-not [VCursor]::IsWindow($targetHwnd)) {
$timer.Stop(); $cursor.Close()
[System.Windows.Forms.Application]::ExitThread()
return
}
# Check stop
if (Test-Path $stopFile) {
$timer.Stop(); $cursor.Close()
try { Remove-Item $stopFile -ErrorAction SilentlyContinue } catch {}
try { Remove-Item $posFile -ErrorAction SilentlyContinue } catch {}
[System.Windows.Forms.Application]::ExitThread()
return
}
# Read position updates
if (Test-Path $posFile) {
try {
$data = Get-Content $posFile -Raw -ErrorAction SilentlyContinue
if ($data) {
$parts = $data.Trim().Split(',')
if ($parts.Length -ge 2) {
$script:lastCX = [int]$parts[0]
$script:lastCY = [int]$parts[1]
if ($parts.Length -ge 3 -and $parts[2] -eq 'click') {
$script:clickFlash = 6 # flash for 6 frames (~100ms)
}
}
Remove-Item $posFile -ErrorAction SilentlyContinue
}
} catch {}
}
# Get window position to convert client coords to screen coords
$wr = New-Object VCursor+RECT
[VCursor]::GetWindowRect($targetHwnd, [ref]$wr) | Out-Null
$screenX = $wr.L + $script:lastCX
$screenY = $wr.T + $script:lastCY
# Click flash: briefly change color
if ($script:clickFlash -gt 0) {
$cursor.Opacity = 1.0
$script:clickFlash--
if ($script:clickFlash -eq 0) {
$cursor.Opacity = ${CURSOR_OPACITY}
}
}
[VCursor]::SetWindowPos($cursor.Handle, [VCursor]::HWND_TOPMOST,
$screenX, $screenY, 0, 0,
[VCursor]::SWP_NOSIZE -bor [VCursor]::SWP_NOACTIVATE -bor [VCursor]::SWP_SHOWWINDOW) | Out-Null
$cursor.Visible = $true
})
$timer.Start()
[System.Windows.Forms.Application]::Run()
`
}
/**
 * Launch the virtual cursor overlay for the given window.
 *
 * Any overlay that is already running is torn down first. The overlay script
 * is written to a timestamped temp file and executed by a detached
 * `powershell` process.
 *
 * @param hwnd Handle of the bound window (validated before use; validation
 *             errors propagate to the caller).
 * @returns true when the process was spawned, false if writing the script or
 *          spawning failed.
 */
export function showVirtualCursor(hwnd: string): boolean {
  const target = validateHwnd(hwnd)
  hideVirtualCursor()

  try {
    const dir = getTmpDir()
    const stamp = Date.now()
    const stopPath = path.join(dir, `cu_vcursor_stop_${stamp}`)
    const scriptPath = path.join(dir, `cu_vcursor_${stamp}.ps1`)

    fs.writeFileSync(scriptPath, buildCursorScript(target, stopPath), 'utf-8')

    const argv = [
      'powershell',
      '-NoProfile',
      '-ExecutionPolicy',
      'Bypass',
      '-File',
      scriptPath,
    ]
    cursorProc = Bun.spawn(argv, { stdout: 'ignore', stderr: 'ignore' })

    // Only publish the file paths once the process actually exists.
    cursorStopFile = stopPath
    cursorScriptFile = scriptPath
    return true
  } catch {
    return false
  }
}
/**
 * Ask the overlay to move to (x, y), optionally flashing to indicate a click.
 *
 * The request is handed over through the `<stopFile>.pos` file as
 * "x,y" or "x,y,click" with both coordinates rounded to integers. Does nothing
 * when no overlay has been started; write errors are ignored (best effort).
 */
export function moveVirtualCursor(
  x: number,
  y: number,
  isClick: boolean = false,
): void {
  if (!cursorStopFile) return

  const target = cursorStopFile + '.pos'
  try {
    const coords = `${Math.round(x)},${Math.round(y)}`
    const payload = isClick ? `${coords},click` : coords
    fs.writeFileSync(target, payload, 'utf-8')
  } catch {}
}
/**
 * Hide and destroy the virtual cursor.
 *
 * Signals the overlay script to exit by creating its stop file, then — after a
 * 2 second grace period — force-kills the process and removes the temporary
 * files. Module state is cleared immediately so a new overlay can be started
 * right away.
 *
 * The process handle and file paths are captured in locals before the module
 * variables are reset. The delayed cleanup must operate on the overlay that
 * was running when this function was called; reading the module variables
 * inside the timer would see `null` (so nothing was ever killed or deleted)
 * or, after a restart, the *new* overlay's process and files.
 */
export function hideVirtualCursor(): void {
  const proc = cursorProc
  const stopFile = cursorStopFile
  const scriptFile = cursorScriptFile

  cursorProc = null
  cursorStopFile = null
  cursorScriptFile = null

  if (!stopFile) return

  try {
    fs.writeFileSync(stopFile, 'STOP', 'utf-8')
  } catch {}

  setTimeout(() => {
    try {
      proc?.kill()
    } catch {}
    // The script normally removes the stop/position files itself; delete them
    // here as well in case it had to be killed. All removals are best effort.
    for (const file of [scriptFile, stopFile, stopFile + '.pos']) {
      if (!file) continue
      try {
        fs.unlinkSync(file)
      } catch {}
    }
  }, 2000)
}
/**
 * Report whether an overlay process handle is currently held.
 *
 * This reflects module state only: it is true from a successful
 * showVirtualCursor() until the next hideVirtualCursor().
 */
export function isVirtualCursorActive(): boolean {
  const hasProcess = cursorProc !== null
  return hasProcess
}

View File

@@ -0,0 +1,66 @@
/**
* Visual indicator for bound windows — DWM native border color.
*
* Uses DwmSetWindowAttribute(DWMWA_BORDER_COLOR) to set a green border
* on the bound window. The border:
* - Is the window's OWN border, not an overlay — zero offset, zero shadow issues
* - Follows window movement/resize/rounded corners automatically (OS-level)
* - Persists across repaints, zero performance overhead
* - Works on Win11 22000+ (Build 22000 = Windows 11 GA)
*
* No overlays, no polling, no separate processes, no z-order issues.
*/
import { validateHwnd, ps } from './shared.js'
/**
 * Give the bound window a green border through
 * DwmSetWindowAttribute(DWMWA_BORDER_COLOR).
 *
 * @param hwnd Window handle (validated before use).
 * @returns true when the PowerShell call printed HRESULT 0 (S_OK).
 */
export function markBound(hwnd: string): boolean {
  const target = validateHwnd(hwnd)
  // Attribute 34 is DWMWA_BORDER_COLOR. The value is a COLORREF (0x00BBGGRR);
  // R=0, G=200, B=0 encodes as 0x0000C800.
  const script = `Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuDwm {
[DllImport("dwmapi.dll")]
public static extern int DwmSetWindowAttribute(IntPtr hwnd, int attr, ref uint val, int size);
}
'@
$color = [uint32]0x0000C800
[CuDwm]::DwmSetWindowAttribute([IntPtr]::new([long]${target}), 34, [ref]$color, 4)`
  const hresult = ps(script)
  return hresult === '0'
}
/**
 * Remove the custom border colour and restore the system default.
 *
 * Sets DWMWA_BORDER_COLOR (attribute 34) to DWMWA_COLOR_DEFAULT (0xFFFFFFFF).
 *
 * @param hwnd Window handle (validated before use).
 * @returns true when the PowerShell call printed HRESULT 0 (S_OK).
 */
export function unmarkBound(hwnd: string): boolean {
  hwnd = validateHwnd(hwnd)
  // DWMWA_COLOR_DEFAULT = 0xFFFFFFFF. PowerShell parses the hex literal
  // 0xFFFFFFFF as the Int32 value -1, and casting -1 to [uint32] fails, so the
  // value is taken from [uint32]::MaxValue instead of a hex literal.
  const hr = ps(
    `Add-Type @'
using System;
using System.Runtime.InteropServices;
public class CuDwm {
[DllImport("dwmapi.dll")]
public static extern int DwmSetWindowAttribute(IntPtr hwnd, int attr, ref uint val, int size);
}
'@
$color = [uint32]::MaxValue
[CuDwm]::DwmSetWindowAttribute([IntPtr]::new([long]${hwnd}), 34, [ref]$color, 4)`,
  )
  return hr === '0'
}
/**
 * Intentionally empty. The DWM border approach spawns no helper processes, so
 * there is nothing to kill here.
 *
 * NOTE(review): this function does not reset the border of any window; a
 * border set by markBound() is only removed by an explicit unmarkBound() call
 * as far as this file shows.
 */
export function cleanupAllBorders(): void {
// The border colour is an attribute stored on the target window.
// NOTE(review): the original comment states it resets automatically when the
// process exits or the window closes — the "process exits" part is not
// demonstrated by this file; confirm before relying on it.
}

View File

@@ -4,7 +4,7 @@
*/
export interface WindowInfo {
hwnd: number
hwnd: string
pid: number
title: string
}
@@ -59,7 +59,13 @@ public class WinEnum {
*/
export function listWindows(): WindowInfo[] {
const result = Bun.spawnSync({
cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', ENUM_WINDOWS_PS],
cmd: [
'powershell',
'-NoProfile',
'-NonInteractive',
'-Command',
ENUM_WINDOWS_PS,
],
stdout: 'pipe',
stderr: 'pipe',
})
@@ -75,11 +81,11 @@ export function listWindows(): WindowInfo[] {
const secondPipe = trimmed.indexOf('|', firstPipe + 1)
if (firstPipe === -1 || secondPipe === -1) return null
const hwnd = Number(trimmed.slice(0, firstPipe))
const hwnd = trimmed.slice(0, firstPipe)
const pid = Number(trimmed.slice(firstPipe + 1, secondPipe))
const title = trimmed.slice(secondPipe + 1)
if (isNaN(hwnd) || isNaN(pid) || !title) return null
if (!hwnd || isNaN(pid) || !title) return null
return { hwnd, pid, title }
})
.filter((item): item is WindowInfo => item !== null)

View File

@@ -0,0 +1,696 @@
/**
* SendMessage-based input for Win32 windows.
*
* ALL text/keyboard operations target a specific HWND via SendMessageW.
* No SendInput / keybd_event / SendKeys — those are global and conflict with the user.
*
* Text input strategy:
* 1. Short text (≤ CLIPBOARD_THRESHOLD chars): SendMessageW(WM_CHAR) per codepoint
* 2. Long text (> threshold): Clipboard.SetText() + SendMessageW(Ctrl+V) paste
* Both paths support full Unicode (Chinese, emoji, etc.) without IME involvement.
*/
import { validateHwnd, runPs, VK_MAP, MODIFIER_KEYS } from './shared.js'
/**
 * Character count above which text input is meant to switch to clipboard paste.
 * NOTE(review): sendText() below always takes the WM_CHAR path and does not
 * read this value — confirm whether any caller still uses it.
 */
const CLIPBOARD_THRESHOLD = 32
/**
 * Cache of findEditChild() results, keyed by the validated parent HWND.
 * The value is the child HWND, or null when the lookup found nothing (negative
 * results are cached too). Assumes the window structure does not change while
 * bound; cleared through clearEditChildCache().
 */
const editChildCache = new Map<string, string | null>()
/**
 * Drop cached edit-child lookups. Intended to be called on unbind.
 *
 * @param hwnd When given (and non-empty), only that window's entry is removed;
 *             otherwise the whole cache is emptied.
 */
export function clearEditChildCache(hwnd?: string): void {
  if (!hwnd) {
    editChildCache.clear()
    return
  }
  editChildCache.delete(hwnd)
}
/**
 * Work out which HWND should receive input messages for a bound window.
 *
 * Returns the edit-capable child located by findEditChild() when there is one
 * (e.g. the InputSite child of a WinUI 3 app or a classic edit control), and
 * the validated window handle itself otherwise.
 */
export function resolveInputHwnd(hwnd: string): string {
  const parent = validateHwnd(hwnd)
  const editChild = findEditChild(parent)
  return editChild ?? parent
}
/**
 * PowerShell prelude that compiles the C# helper class `WinMsg` via Add-Type.
 * It is prepended to the scripts built in this file.
 *
 * WinMsg exposes:
 *  - P/Invoke bindings for EnumChildWindows, GetClassName, SendMessageW
 *    (as `SendMessage`), PostMessageW (as `PostMessage`) and MapVirtualKeyW;
 *  - MakeLParam(lo, hi) for packing two 16-bit values into an lParam;
 *  - KeyDownLParam(vk) / KeyUpLParam(vk), which build key-message lParams with
 *    the scan code looked up through MapVirtualKeyW;
 *  - WM_* message constants for character, key and mouse-button messages;
 *  - FindChildren(parent), which fills `childResults` with one
 *    "hwnd|className" string per enumerated child window.
 *
 * The text inside the template is runtime script content, not TypeScript.
 */
const WINMSG_TYPE = `
Add-Type @'
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
public class WinMsg {
public delegate bool EnumChildProc(IntPtr hWnd, IntPtr lParam);
[DllImport("user32.dll")]
public static extern bool EnumChildWindows(IntPtr parent, EnumChildProc proc, IntPtr lParam);
[DllImport("user32.dll", CharSet=CharSet.Unicode)]
public static extern int GetClassName(IntPtr h, StringBuilder sb, int max);
// CRITICAL: CharSet.Unicode → resolves to SendMessageW
// SendMessageW sends Unicode WM_CHAR (full UTF-16 codepoints including CJK)
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMessage(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="PostMessageW")]
public static extern bool PostMessage(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);
[DllImport("user32.dll")]
public static extern uint MapVirtualKeyW(uint uCode, uint uMapType);
public static IntPtr MakeLParam(int lo, int hi) {
return (IntPtr)((hi << 16) | (lo & 0xFFFF));
}
// Build lParam for WM_KEYDOWN / WM_KEYUP with correct scan code
// lParam bits: 0-15 repeat count, 16-23 scan code, 24 extended, 30 prev state, 31 transition
public static IntPtr KeyDownLParam(uint vk) {
uint scanCode = MapVirtualKeyW(vk, 0); // MAPVK_VK_TO_VSC = 0
return (IntPtr)(1 | (scanCode << 16)); // repeat=1, scanCode in bits 16-23
}
public static IntPtr KeyUpLParam(uint vk) {
uint scanCode = MapVirtualKeyW(vk, 0);
return (IntPtr)(1 | (scanCode << 16) | (1 << 30) | (1u << 31)); // prev=1, transition=1
}
public const uint WM_CHAR = 0x0102;
public const uint WM_KEYDOWN = 0x0100;
public const uint WM_KEYUP = 0x0101;
public const uint WM_LBUTTONDOWN = 0x0201;
public const uint WM_LBUTTONUP = 0x0202;
public const uint WM_RBUTTONDOWN = 0x0204;
public const uint WM_RBUTTONUP = 0x0205;
public static List<string> childResults = new List<string>();
public static void FindChildren(IntPtr parent) {
childResults.Clear();
EnumChildWindows(parent, delegate(IntPtr hWnd, IntPtr lParam) {
StringBuilder sb = new StringBuilder(256);
GetClassName(hWnd, sb, sb.Capacity);
childResults.Add(hWnd.ToInt64() + "|" + sb.ToString());
return true;
}, IntPtr.Zero);
}
}
'@
`
// Window class names that findEditChild() treats as input targets, listed in
// priority order: the list is scanned from the top, and the first class that
// has a matching child window wins. Matching is by exact class-name equality.
const EDIT_CLASSES = [
'Windows.UI.Input.InputSite.WindowClass', // WinUI 3 input bridge (Windows Terminal, etc.)
'RichEditD2DPT', // Win11 Notepad (WinUI 3)
'RichEdit20W', // WordPad
'Edit', // Classic edit controls
'Scintilla', // Scintilla-based editors (Notepad++, etc.)
'Chrome_RenderWidgetHostHWND', // Chrome/Electron
'TextBox', // WPF TextBox
'RichTextBox', // WPF RichTextBox
'Windows.UI.Core.CoreWindow', // UWP CoreWindow (input target for some UWP apps)
]
/**
 * Find the first edit-capable child window of a parent HWND.
 *
 * Strategy:
 *  1. EnumChildWindows — list all child windows and pick the first one whose
 *     class name appears in EDIT_CLASSES (scanned in priority order).
 *  2. UI Automation fallback — find the first Edit/Document element below the
 *     window and use its native window handle.
 *
 * EnumChildWindows is recursive and enumerates all descendant windows, but for
 * UWP apps the edit control may be in a different process (hosted inside
 * ApplicationFrameHost). UI Automation crosses process boundaries.
 *
 * Results, including "nothing found" (null), are cached per parent HWND until
 * clearEditChildCache() is called.
 *
 * Handles read back from PowerShell are only accepted when they are plain
 * decimal integers. The returned value is interpolated into later scripts, so
 * stray output (warnings, partial errors, an untrimmed 'NONE') must not be
 * cached as an HWND.
 *
 * @param parentHwnd Handle of the window to search (validated before use).
 * @returns The child HWND as a decimal string, or null when none was found.
 */
export function findEditChild(parentHwnd: string): string | null {
  parentHwnd = validateHwnd(parentHwnd)
  // Cache hit (the cached value may legitimately be null)
  if (editChildCache.has(parentHwnd)) {
    return editChildCache.get(parentHwnd) ?? null
  }
  const isHandle = (value: string): boolean => /^-?\d+$/.test(value)

  // Strategy 1: EnumChildWindows (fast, works for Win32 apps)
  const script = `${WINMSG_TYPE}
[WinMsg]::FindChildren([IntPtr]::new([long]${parentHwnd}))
[WinMsg]::childResults | ForEach-Object { $_ }
`
  const raw = runPs(script)
  if (raw) {
    // className → first enumerated HWND carrying that class
    const firstByClass = new Map<string, string>()
    for (const line of raw.split('\n')) {
      const trimmed = line.trim()
      const pipe = trimmed.indexOf('|')
      if (pipe === -1) continue
      const childHwnd = trimmed.slice(0, pipe)
      const className = trimmed.slice(pipe + 1)
      if (!isHandle(childHwnd) || firstByClass.has(className)) continue
      firstByClass.set(className, childHwnd)
    }
    // Search in priority order
    for (const editClass of EDIT_CLASSES) {
      const match = firstByClass.get(editClass)
      if (match !== undefined) {
        editChildCache.set(parentHwnd, match)
        return match
      }
    }
  }

  // Strategy 2: UI Automation (crosses process boundaries, finds UWP edit controls)
  const uiaScript = `
Add-Type -AssemblyName UIAutomationClient
Add-Type -AssemblyName UIAutomationTypes
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class UiaHelper {
[DllImport("user32.dll")]
public static extern bool IsWindow(IntPtr hWnd);
}
'@
try {
$el = [System.Windows.Automation.AutomationElement]::FromHandle([IntPtr]::new([long]${parentHwnd}))
if ($el -eq $null) { Write-Output 'NONE'; exit }
# Search for Edit or Document control types (covers text editors)
$editCond = [System.Windows.Automation.PropertyCondition]::new(
[System.Windows.Automation.AutomationElement]::ControlTypeProperty,
[System.Windows.Automation.ControlType]::Edit)
$docCond = [System.Windows.Automation.PropertyCondition]::new(
[System.Windows.Automation.AutomationElement]::ControlTypeProperty,
[System.Windows.Automation.ControlType]::Document)
$orCond = [System.Windows.Automation.OrCondition]::new($editCond, $docCond)
$found = $el.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $orCond)
if ($found -eq $null) { Write-Output 'NONE'; exit }
$nativeHwnd = $found.Current.NativeWindowHandle
if ($nativeHwnd -ne 0) {
Write-Output $nativeHwnd
} else {
Write-Output 'NONE'
}
} catch {
Write-Output 'NONE'
}
`
  const uiaResult = runPs(uiaScript)
  // Trim before inspecting so that trailing CR/LF cannot disguise 'NONE'.
  const uiaHwnd = uiaResult ? uiaResult.trim() : ''
  if (isHandle(uiaHwnd) && uiaHwnd !== '0') {
    editChildCache.set(parentHwnd, uiaHwnd)
    return uiaHwnd
  }

  editChildCache.set(parentHwnd, null)
  return null
}
/**
 * Deliver one Unicode character to a window through SendMessageW(WM_CHAR).
 *
 * Only the first code point of `char` is sent. A code point outside the BMP
 * (emoji, rare CJK, ...) goes out as its UTF-16 surrogate pair, i.e. two
 * consecutive WM_CHAR messages.
 *
 * @returns false for an empty string or when the PowerShell call yields null.
 */
export function sendChar(hwnd: string, char: string): boolean {
  const target = validateHwnd(hwnd)
  const first = char.codePointAt(0)
  if (first === undefined) return false

  // One WM_CHAR line for a BMP character, two (high + low surrogate) otherwise.
  const wmCharLines = buildWmCharLines(target, String.fromCodePoint(first))
  const script = `${WINMSG_TYPE}\n${wmCharLines.join('\n')}\n`
  return runPs(script) !== null
}
/**
 * Produce one PowerShell `SendMessage(..., WM_CHAR, ...)` line per UTF-16 code
 * unit of `text`.
 *
 * A JavaScript string already stores a supplementary code point as its
 * high/low surrogate pair, so walking the code units yields exactly the
 * sequence WM_CHAR expects: one message per BMP character and two per
 * supplementary character.
 *
 * @param hwnd Target window handle, interpolated verbatim into each line.
 * @param text Text to send; an empty string yields an empty array.
 */
function buildWmCharLines(hwnd: string, text: string): string[] {
  const target = `[IntPtr]::new([long]${hwnd})`
  const wmChar = (unit: number): string =>
    `[WinMsg]::SendMessage(${target}, [WinMsg]::WM_CHAR, [IntPtr]${unit}, [IntPtr]0)`
  return Array.from({ length: text.length }, (_, index) =>
    wmChar(text.charCodeAt(index)),
  )
}
/**
 * Paste text via clipboard into the target window.
 * Uses Clipboard.SetText() + PostMessageW(Ctrl+V key messages).
 * NO global APIs (SendInput/keybd_event/SendKeys) — only window-targeted messages.
 *
 * The script saves the clipboard's current text, replaces it with `text`,
 * posts Ctrl down / V down / V up / Ctrl up to the window, waits 50 ms and
 * then restores the saved text (or clears the clipboard if there was none).
 *
 * @param hwnd Target window handle; interpolated verbatim, so it must already
 *             be validated by the caller.
 * @param text Text to paste.
 * @returns true when the script reported 'OK', or when `text` is empty and
 *          there is nothing to paste.
 */
function pasteViaClipboard(hwnd: string, text: string): boolean {
  // Clipboard.SetText('') throws; the script would then carry on and paste
  // whatever the user had on the clipboard. Nothing to paste → nothing to do.
  if (text === '') return true
  // Escape for a PowerShell single-quoted literal. PowerShell accepts the
  // typographic quotes U+2018/2019/201A/201B as single-quote delimiters too,
  // so all of them must be doubled, not just the ASCII apostrophe.
  const escaped = text.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const hwndExpr = `[IntPtr]::new([long]${hwnd})`
  const script = `${WINMSG_TYPE}
Add-Type -AssemblyName System.Windows.Forms
# Save current clipboard
$saved = $null
try { $saved = [System.Windows.Forms.Clipboard]::GetText() } catch {}
# Set our text
[System.Windows.Forms.Clipboard]::SetText('${escaped}')
# Ctrl+V via PostMessage to the target window (NOT global keybd_event)
# Must use PostMessage + correct lParam (scan code) for Windows Terminal / ConPTY
[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]0x11, [WinMsg]::KeyDownLParam(0x11)) # Ctrl down
[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]0x56, [WinMsg]::KeyDownLParam(0x56)) # V down
[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]0x56, [WinMsg]::KeyUpLParam(0x56)) # V up
[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]0x11, [WinMsg]::KeyUpLParam(0x11)) # Ctrl up
# Brief wait for paste to complete
Start-Sleep -Milliseconds 50
# Restore clipboard
if ($saved -ne $null -and $saved -ne '') {
try { [System.Windows.Forms.Clipboard]::SetText($saved) } catch {}
} else {
try { [System.Windows.Forms.Clipboard]::Clear() } catch {}
}
Write-Output 'OK'
`
  return runPs(script) === 'OK'
}
/**
 * Type text into a window, one WM_CHAR message per UTF-16 code unit.
 *
 * The WM_CHAR path is always used — it works across window types, including
 * Windows Terminal / ConPTY where clipboard-based Ctrl+V doesn't work. Input
 * is addressed to the window returned by resolveInputHwnd(); no global input
 * APIs are involved.
 *
 * @returns true unless the PowerShell call yields null.
 */
export function sendText(hwnd: string, text: string): boolean {
  const wmCharLines = buildWmCharLines(resolveInputHwnd(hwnd), text)
  const script = `${WINMSG_TYPE}\n${wmCharLines.join('\n')}\n`
  return runPs(script) !== null
}
/**
 * Post a single key-down or key-up message to a window.
 *
 * PostMessageW (asynchronous) is used rather than SendMessage — required for
 * Windows Terminal and ConPTY-based console windows to process key events
 * correctly. The lParam carries the scan code obtained via MapVirtualKeyW.
 *
 * @param hwnd   Target window handle (validated before use).
 * @param vk     Virtual-key code.
 * @param action 'down' posts WM_KEYDOWN (0x0100); anything else WM_KEYUP (0x0101).
 * @returns true unless the PowerShell call yields null.
 */
export function sendKey(
  hwnd: string,
  vk: number,
  action: 'down' | 'up',
): boolean {
  const target = validateHwnd(hwnd)
  const pressing = action === 'down'
  const message = pressing ? '0x0100' : '0x0101'
  const lParamBuilder = pressing ? 'KeyDownLParam' : 'KeyUpLParam'
  const postLine = `[WinMsg]::PostMessage([IntPtr]::new([long]${target}), ${message}, [IntPtr]${vk}, [WinMsg]::${lParamBuilder}(${vk}))`
  return runPs(`${WINMSG_TYPE}\n${postLine}\n`) !== null
}
/**
 * Send a key combination (e.g. ['ctrl', 'a']).
 * Posts WM_KEYDOWN for each modifier, WM_KEYDOWN/WM_KEYUP for the main key,
 * then WM_KEYUP for the modifiers in reverse order.
 * All via PostMessageW to the resolved input window — no global APIs.
 *
 * Key names are looked up case-insensitively in VK_MAP. A single character
 * that is not in VK_MAP is accepted only when its virtual-key code is known:
 * letters and digits (whose VK code equals the uppercase ASCII code), space,
 * and the OEM keys ',', '-', '.', '+' that Windows defines independently of
 * the keyboard layout. Other characters are rejected: their ASCII code is NOT
 * their VK code ('.' is 0x2E = VK_DELETE, '[' is 0x5B = VK_LWIN, ...), so
 * using it would press an unrelated key.
 *
 * If the combo names several non-modifier keys, the last one is used.
 *
 * @returns false when the combo is empty, contains an unknown key, or has no
 *          non-modifier key; otherwise true unless the PowerShell call yields null.
 */
export function sendKeys(hwnd: string, combo: string[]): boolean {
  hwnd = resolveInputHwnd(hwnd)
  if (combo.length === 0) return false

  // Layout-independent single-character keys (see Win32 virtual-key codes).
  const charVk = new Map<string, number>([
    [' ', 0x20], // VK_SPACE
    ['+', 0xbb], // VK_OEM_PLUS
    [',', 0xbc], // VK_OEM_COMMA
    ['-', 0xbd], // VK_OEM_MINUS
    ['.', 0xbe], // VK_OEM_PERIOD
  ])

  const modifiers: number[] = []
  let mainKey: number | undefined
  for (const key of combo) {
    const lower = key.toLowerCase()
    const vk = VK_MAP[lower]
    if (vk !== undefined) {
      if (MODIFIER_KEYS.has(lower)) {
        modifiers.push(vk)
      } else {
        mainKey = vk
      }
    } else if (/^[a-z0-9]$/.test(lower)) {
      // VK codes for A-Z and 0-9 coincide with their uppercase ASCII codes
      mainKey = lower.toUpperCase().charCodeAt(0)
    } else if (charVk.has(lower)) {
      mainKey = charVk.get(lower)
    } else {
      return false
    }
  }
  if (mainKey === undefined) return false

  // Build script: modifiers down, key down, key up, modifiers up (reverse)
  // Uses PostMessage (async) + correct lParam (scan code) — required for
  // Windows Terminal / ConPTY to correctly translate key events.
  const hwndExpr = `[IntPtr]::new([long]${hwnd})`
  const lines: string[] = []
  for (const mod of modifiers) {
    lines.push(
      `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]${mod}, [WinMsg]::KeyDownLParam(${mod}))`,
    )
  }
  lines.push(
    `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYDOWN, [IntPtr]${mainKey}, [WinMsg]::KeyDownLParam(${mainKey}))`,
  )
  lines.push(
    `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]${mainKey}, [WinMsg]::KeyUpLParam(${mainKey}))`,
  )
  for (const mod of [...modifiers].reverse()) {
    lines.push(
      `[WinMsg]::PostMessage(${hwndExpr}, [WinMsg]::WM_KEYUP, [IntPtr]${mod}, [WinMsg]::KeyUpLParam(${mod}))`,
    )
  }
  const script = `${WINMSG_TYPE}
${lines.join('\n')}
`
  return runPs(script) !== null
}
// ── Console Input Buffer (WriteConsoleInput) ─────────────────────────
// For terminal/console windows, SendMessageW doesn't reliably inject
// key events into the Console Input Buffer that raw-mode stdin reads.
// The ConsoleInput helper below uses AttachConsole + WriteConsoleInput to
// inject key records directly.
//
// MapVirtualKeyW is imported from user32.dll: that is the DLL that exports it.
// Importing it from kernel32.dll makes every SendKeyToConsole call fail with
// EntryPointNotFoundException.
//
// NOTE(review): GetStdHandle(STD_INPUT_HANDLE) is read after AttachConsole;
// if this PowerShell process was started with redirected stdio the handle may
// not refer to the attached console — confirm, or open CONIN$ explicitly.
const CONSOLE_INPUT_TYPE = `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class ConsoleInput {
[DllImport("kernel32.dll", SetLastError=true)]
public static extern bool AttachConsole(uint dwProcessId);
[DllImport("kernel32.dll", SetLastError=true)]
public static extern bool FreeConsole();
[DllImport("kernel32.dll", SetLastError=true)]
public static extern IntPtr GetStdHandle(int nStdHandle);
[DllImport("kernel32.dll", CharSet=CharSet.Unicode, SetLastError=true)]
public static extern bool WriteConsoleInput(
IntPtr hConsoleInput,
INPUT_RECORD[] lpBuffer,
uint nLength,
out uint lpNumberOfEventsWritten);
[DllImport("user32.dll")]
public static extern uint MapVirtualKeyW(uint uCode, uint uMapType);
[DllImport("user32.dll")]
public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint lpdwProcessId);
public const int STD_INPUT_HANDLE = -10;
[StructLayout(LayoutKind.Explicit)]
public struct INPUT_RECORD {
[FieldOffset(0)] public ushort EventType;
[FieldOffset(4)] public KEY_EVENT_RECORD KeyEvent;
}
[StructLayout(LayoutKind.Explicit, CharSet=CharSet.Unicode)]
public struct KEY_EVENT_RECORD {
[FieldOffset(0)] public bool bKeyDown;
[FieldOffset(4)] public ushort wRepeatCount;
[FieldOffset(6)] public ushort wVirtualKeyCode;
[FieldOffset(8)] public ushort wVirtualScanCode;
[FieldOffset(10)] public char UnicodeChar;
[FieldOffset(12)] public uint dwControlKeyState;
}
public static bool SendKeyToConsole(IntPtr hwnd, ushort vk, char ch) {
uint pid;
GetWindowThreadProcessId(hwnd, out pid);
if (pid == 0) return false;
FreeConsole();
if (!AttachConsole(pid)) return false;
try {
IntPtr hInput = GetStdHandle(STD_INPUT_HANDLE);
if (hInput == IntPtr.Zero || hInput == (IntPtr)(-1)) return false;
ushort scanCode = (ushort)MapVirtualKeyW(vk, 0);
INPUT_RECORD[] records = new INPUT_RECORD[2];
// Key down
records[0].EventType = 1; // KEY_EVENT
records[0].KeyEvent.bKeyDown = true;
records[0].KeyEvent.wRepeatCount = 1;
records[0].KeyEvent.wVirtualKeyCode = vk;
records[0].KeyEvent.wVirtualScanCode = scanCode;
records[0].KeyEvent.UnicodeChar = ch;
records[0].KeyEvent.dwControlKeyState = 0;
// Key up
records[1].EventType = 1;
records[1].KeyEvent.bKeyDown = false;
records[1].KeyEvent.wRepeatCount = 1;
records[1].KeyEvent.wVirtualKeyCode = vk;
records[1].KeyEvent.wVirtualScanCode = scanCode;
records[1].KeyEvent.UnicodeChar = ch;
records[1].KeyEvent.dwControlKeyState = 0;
uint written;
return WriteConsoleInput(hInput, records, 2, out written);
} finally {
FreeConsole();
}
}
public static bool SendTextToConsole(IntPtr hwnd, string text) {
uint pid;
GetWindowThreadProcessId(hwnd, out pid);
if (pid == 0) return false;
FreeConsole();
if (!AttachConsole(pid)) return false;
try {
IntPtr hInput = GetStdHandle(STD_INPUT_HANDLE);
if (hInput == IntPtr.Zero || hInput == (IntPtr)(-1)) return false;
INPUT_RECORD[] records = new INPUT_RECORD[text.Length * 2];
for (int i = 0; i < text.Length; i++) {
char c = text[i];
ushort vk = 0;
ushort sc = 0;
// Key down
records[i * 2].EventType = 1;
records[i * 2].KeyEvent.bKeyDown = true;
records[i * 2].KeyEvent.wRepeatCount = 1;
records[i * 2].KeyEvent.wVirtualKeyCode = vk;
records[i * 2].KeyEvent.wVirtualScanCode = sc;
records[i * 2].KeyEvent.UnicodeChar = c;
records[i * 2].KeyEvent.dwControlKeyState = 0;
// Key up
records[i * 2 + 1].EventType = 1;
records[i * 2 + 1].KeyEvent.bKeyDown = false;
records[i * 2 + 1].KeyEvent.wRepeatCount = 1;
records[i * 2 + 1].KeyEvent.wVirtualKeyCode = vk;
records[i * 2 + 1].KeyEvent.wVirtualScanCode = sc;
records[i * 2 + 1].KeyEvent.UnicodeChar = c;
records[i * 2 + 1].KeyEvent.dwControlKeyState = 0;
}
uint written;
return WriteConsoleInput(hInput, records, (uint)records.Length, out written);
} finally {
FreeConsole();
}
}
}
'@
`
/**
 * Send a key to a console window via WriteConsoleInput (Console Input Buffer).
 * This is required for terminal apps like Claude Code REPL that read stdin in raw mode.
 *
 * One key-down and one key-up record are written for the given virtual key.
 *
 * @param hwnd Console window handle (validated before use).
 * @param vk   Virtual-key code placed in both records.
 * @param ch   Character placed in the records' UnicodeChar field; only its
 *             first UTF-16 code unit is used. Defaults to NUL, and an empty
 *             string is treated as NUL as well.
 * @returns true unless the PowerShell call yields null.
 */
export function consoleKey(
  hwnd: string,
  vk: number,
  ch: string = '\0',
): boolean {
  hwnd = validateHwnd(hwnd)
  // ''.charCodeAt(0) is NaN, which would render as the invalid `[char]NaN`.
  const charCode = ch.charCodeAt(0) || 0
  const script = `${CONSOLE_INPUT_TYPE}
[ConsoleInput]::SendKeyToConsole([IntPtr]::new([long]${hwnd}), ${vk}, [char]${charCode})
`
  return runPs(script) !== null
}
/**
 * Send text to a console window via WriteConsoleInput.
 * Directly injects into the Console Input Buffer — works for raw-mode stdin.
 *
 * A key-down/key-up record pair is written for every UTF-16 code unit of
 * `text`. No Enter key is appended: include '\r' in `text` if a line should be
 * submitted.
 *
 * @param hwnd Console window handle (validated before use).
 * @param text Text to inject.
 * @returns true unless the PowerShell call yields null.
 */
export function consoleText(hwnd: string, text: string): boolean {
  hwnd = validateHwnd(hwnd)
  // Escape for a PowerShell single-quoted literal. PowerShell accepts the
  // typographic quotes U+2018/2019/201A/201B as single-quote delimiters too,
  // so all of them must be doubled, not just the ASCII apostrophe.
  const escaped = text.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `${CONSOLE_INPUT_TYPE}
[ConsoleInput]::SendTextToConsole([IntPtr]::new([long]${hwnd}), '${escaped}')
`
  return runPs(script) !== null
}
/**
 * Send a mouse click (button down followed by button up) at coordinates (x, y).
 * Via SendMessageW — window-targeted, no cursor movement.
 *
 * The button-down message carries the matching MK_* flag in wParam
 * (MK_LBUTTON = 0x0001, MK_RBUTTON = 0x0002), as real input does and as
 * sendMouseDown() already does; the button-up message carries 0.
 *
 * NOTE(review): the messages go to the window returned by resolveInputHwnd(),
 * which may be a child of `hwnd`, while (x, y) are passed through unchanged —
 * confirm which window's client area callers measure the coordinates against.
 *
 * @param button 'left' sends WM_LBUTTONDOWN/UP; anything else WM_RBUTTONDOWN/UP.
 * @returns true unless the PowerShell call yields null.
 */
export function sendClick(
  hwnd: string,
  x: number,
  y: number,
  button: 'left' | 'right',
): boolean {
  hwnd = resolveInputHwnd(hwnd)
  const isLeft = button === 'left'
  const downMsg = isLeft ? '0x0201' : '0x0204'
  const upMsg = isLeft ? '0x0202' : '0x0205'
  // Key-state flag for the button that is being held during the down message
  const downFlags = isLeft ? '0x0001' : '0x0002'
  const hwndExpr = `[IntPtr]::new([long]${hwnd})`
  const script = `${WINMSG_TYPE}
$lp = [WinMsg]::MakeLParam(${x}, ${y})
[WinMsg]::SendMessage(${hwndExpr}, ${downMsg}, [IntPtr]${downFlags}, $lp)
[WinMsg]::SendMessage(${hwndExpr}, ${upMsg}, [IntPtr]0, $lp)
`
  return runPs(script) !== null
}
/**
 * Press (without releasing) the left mouse button at coordinates (x, y).
 *
 * Sends WM_LBUTTONDOWN with wParam 1 through SendMessageW to the window
 * returned by resolveInputHwnd(); the real cursor is not moved.
 *
 * @returns true unless the PowerShell call yields null.
 */
export function sendMouseDown(hwnd: string, x: number, y: number): boolean {
  const target = resolveInputHwnd(hwnd)
  const script = [
    WINMSG_TYPE,
    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), [WinMsg]::WM_LBUTTONDOWN, [IntPtr]1, $lp)`,
    '',
  ].join('\n')
  return runPs(script) !== null
}
/**
 * Release the left mouse button at coordinates (x, y).
 *
 * Sends WM_LBUTTONUP with wParam 0 through SendMessageW to the window
 * returned by resolveInputHwnd(); the real cursor is not moved.
 *
 * @returns true unless the PowerShell call yields null.
 */
export function sendMouseUp(hwnd: string, x: number, y: number): boolean {
  const target = resolveInputHwnd(hwnd)
  const script = [
    WINMSG_TYPE,
    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), [WinMsg]::WM_LBUTTONUP, [IntPtr]0, $lp)`,
    '',
  ].join('\n')
  return runPs(script) !== null
}
/**
 * Report a mouse move to coordinates (x, y); used while dragging.
 *
 * Sends message 0x0200 (WM_MOUSEMOVE) with wParam 1 through SendMessageW to
 * the window returned by resolveInputHwnd().
 *
 * @returns true unless the PowerShell call yields null.
 */
export function sendMouseMove(hwnd: string, x: number, y: number): boolean {
  const target = resolveInputHwnd(hwnd)
  const script = [
    WINMSG_TYPE,
    `$lp = [WinMsg]::MakeLParam(${x}, ${y})`,
    `[WinMsg]::SendMessage([IntPtr]::new([long]${target}), 0x0200, [IntPtr]1, $lp)`,
    '',
  ].join('\n')
  return runPs(script) !== null
}
/**
 * Scroll the mouse wheel over client-area point (x, y) of a window.
 * Via SendMessageW(WM_MOUSEWHEEL / WM_MOUSEHWHEEL).
 *
 * WM_MOUSEWHEEL (0x020A): vertical scroll (positive delta = scroll up)
 * WM_MOUSEHWHEEL (0x020E): horizontal scroll (positive delta = scroll right)
 *
 * `delta` counts wheel notches: it is rounded and multiplied by WHEEL_DELTA
 * (120) before being packed into the high word of wParam. The helper converts
 * (x, y) to screen coordinates with ClientToScreen, because these messages
 * carry screen (not client) coordinates in lParam.
 *
 * Works on Excel, browsers, modern UI — unlike WM_VSCROLL/WM_HSCROLL
 * which only work on traditional scrollbar controls.
 *
 * @returns true unless the PowerShell call yields null.
 */
export function sendMouseWheel(
  hwnd: string,
  x: number,
  y: number,
  delta: number,
  horizontal: boolean = false,
): boolean {
  const target = resolveInputHwnd(hwnd)
  // One notch equals WHEEL_DELTA (120). Positive = up/right, negative = down/left.
  const scaledDelta = Math.round(delta) * 120
  // WM_MOUSEHWHEEL = 0x020E, WM_MOUSEWHEEL = 0x020A
  const wheelMsg = horizontal ? '0x020E' : '0x020A'
  const script = `${WINMSG_TYPE}
# WM_MOUSEWHEEL/WM_MOUSEHWHEEL require screen coords in lParam
# and wheel delta in high word of wParam
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WheelHelper {
[DllImport("user32.dll")] public static extern bool ClientToScreen(IntPtr hWnd, ref POINT p);
[StructLayout(LayoutKind.Sequential)] public struct POINT { public int X, Y; }
[DllImport("user32.dll", CharSet=CharSet.Unicode, EntryPoint="SendMessageW")]
public static extern IntPtr SendMsg(IntPtr hWnd, uint msg, IntPtr wParam, IntPtr lParam);
public static void Scroll(IntPtr hWnd, int clientX, int clientY, int delta, uint msg) {
POINT pt; pt.X = clientX; pt.Y = clientY;
ClientToScreen(hWnd, ref pt);
IntPtr wParam = (IntPtr)(delta << 16);
IntPtr lParam = (IntPtr)((pt.Y << 16) | (pt.X & 0xFFFF));
SendMsg(hWnd, msg, wParam, lParam);
}
}
'@
[WheelHelper]::Scroll([IntPtr]::new([long]${target}), ${x}, ${y}, ${scaledDelta}, ${wheelMsg})
`
  return runPs(script) !== null
}