Merge remote-tracking branch 'amDosion/feat/computer-use-windows'

This commit is contained in:
claude-code-best
2026-04-04 00:12:52 +08:00
39 changed files with 9446 additions and 752 deletions

View File

@@ -1602,7 +1602,7 @@ async function run(): Promise<CommanderCommand> {
// `type: 'stdio'`. An enterprise-config ant with the GB gate on would
// otherwise process.exit(1). Chrome has the same latent issue but has
// shipped without incident; chicago places itself correctly.
if (feature('CHICAGO_MCP') && getPlatform() === 'macos' && !getIsNonInteractiveSession()) {
if (feature('CHICAGO_MCP') && !getIsNonInteractiveSession()) {
try {
const {
getChicagoEnabled

View File

@@ -52,8 +52,8 @@ export function getTerminalBundleId(): string | null {
* takes this shape (no `hostBundleId`, no `teachMode`).
*/
export const CLI_CU_CAPABILITIES = {
screenshotFiltering: 'native' as const,
platform: 'darwin' as const,
screenshotFiltering: (process.platform === 'darwin' ? 'native' : 'none') as any,
platform: (process.platform === 'win32' ? 'windows' : process.platform === 'linux' ? 'linux' : 'darwin') as any,
}
export function isComputerUseMCPServer(name: string): boolean {

View File

@@ -59,6 +59,7 @@ export const releasePump = release
* concurrent drainRunLoop() calls share one setInterval.
*/
export async function drainRunLoop<T>(fn: () => Promise<T>): Promise<T> {
if (process.platform !== 'darwin') return fn()
retain()
let timer: ReturnType<typeof setTimeout> | undefined
try {

View File

@@ -23,6 +23,7 @@ import { requireComputerUseSwift } from './swiftLoader.js'
let registered = false
export function registerEscHotkey(onEscape: () => void): boolean {
if (process.platform !== 'darwin') return false
if (registered) return true
const cu = requireComputerUseSwift()
if (!(cu as any).hotkey.registerEscape(onEscape)) {

View File

@@ -68,6 +68,24 @@ function computeTargetDims(
}
async function readClipboardViaPbpaste(): Promise<string> {
if (process.platform === 'win32') {
const { stdout, code } = await execFileNoThrow('powershell', ['-NoProfile', '-Command', 'Get-Clipboard'], {
useCwd: false,
})
if (code !== 0) {
throw new Error(`PowerShell Get-Clipboard exited with code ${code}`)
}
return stdout
}
if (process.platform === 'linux') {
const { stdout, code } = await execFileNoThrow('xclip', ['-selection', 'clipboard', '-o'], {
useCwd: false,
})
if (code !== 0) {
throw new Error(`xclip exited with code ${code}`)
}
return stdout
}
const { stdout, code } = await execFileNoThrow('pbpaste', [], {
useCwd: false,
})
@@ -78,6 +96,25 @@ async function readClipboardViaPbpaste(): Promise<string> {
}
async function writeClipboardViaPbcopy(text: string): Promise<void> {
if (process.platform === 'win32') {
const { code } = await execFileNoThrow('powershell', ['-NoProfile', '-Command', `Set-Clipboard -Value '${text.replace(/'/g, "''")}'`], {
useCwd: false,
})
if (code !== 0) {
throw new Error(`PowerShell Set-Clipboard exited with code ${code}`)
}
return
}
if (process.platform === 'linux') {
const { code } = await execFileNoThrow('xclip', ['-selection', 'clipboard'], {
input: text,
useCwd: false,
})
if (code !== 0) {
throw new Error(`xclip exited with code ${code}`)
}
return
}
const { code } = await execFileNoThrow('pbcopy', [], {
input: text,
useCwd: false,
@@ -192,7 +229,7 @@ async function typeViaClipboard(input: Input, text: string): Promise<void> {
if ((await readClipboardViaPbpaste()) !== text) {
throw new Error('Clipboard write did not round-trip.')
}
await input.keys(['command', 'v'])
await input.keys([process.platform === 'darwin' ? 'command' : 'ctrl', 'v'])
await sleep(100)
} finally {
if (typeof saved === 'string') {
@@ -260,9 +297,9 @@ export function createCliExecutor(opts: {
getMouseAnimationEnabled: () => boolean
getHideBeforeActionEnabled: () => boolean
}): ComputerExecutor {
if (process.platform !== 'darwin') {
if (process.platform !== 'darwin' && process.platform !== 'win32' && process.platform !== 'linux') {
throw new Error(
`createCliExecutor called on ${process.platform}. Computer control is macOS-only.`,
`createCliExecutor called on ${process.platform}. Computer control requires macOS, Windows, or Linux.`,
)
}
@@ -377,7 +414,7 @@ export function createCliExecutor(opts: {
d.height,
d.scaleFactor,
)
return drainRunLoop(() =>
const raw = await drainRunLoop(() =>
cu.resolvePrepareCapture(
withoutTerminal(opts.allowedBundleIds),
surrogateHost,
@@ -389,6 +426,14 @@ export function createCliExecutor(opts: {
opts.doHide,
),
)
// Ensure the result has fields expected by toolCalls.ts (hidden, displayId).
// macOS native returns these from Swift; our cross-platform ComputerUseAPI
// returns {base64, width, height} — fill in the missing fields.
return {
...raw,
hidden: (raw as any).hidden ?? [],
displayId: (raw as any).displayId ?? opts.preferredDisplayId ?? d.displayId,
}
},
/**

View File

@@ -10,7 +10,7 @@ type ChicagoConfig = CuSubGates & {
}
const DEFAULTS: ChicagoConfig = {
enabled: false,
enabled: true,
pixelValidation: false,
clipboardPasteMultiline: true,
mouseAnimation: true,
@@ -37,9 +37,7 @@ function readConfig(): ChicagoConfig {
// regardless of subscription tier — not all ants are max/pro, and per
// CLAUDE.md:281, USER_TYPE !== 'ant' branches get zero antfooding.
function hasRequiredSubscription(): boolean {
if (process.env.USER_TYPE === 'ant') return true
const tier = getSubscriptionType()
return tier === 'max' || tier === 'pro'
return true
}
export function getChicagoEnabled(): boolean {

View File

@@ -45,6 +45,7 @@ export function getComputerUseHostAdapter(): ComputerUseHostAdapter {
getHideBeforeActionEnabled: () => getChicagoSubGates().hideBeforeAction,
}),
ensureOsPermissions: async () => {
if (process.platform !== 'darwin') return { granted: true }
const cu = requireComputerUseSwift()
const accessibility = (cu as any).tcc.checkAccessibility()
const screenRecording = (cu as any).tcc.checkScreenRecording()

View File

@@ -13,11 +13,17 @@ let cached: ComputerUseAPI | undefined
* these in drainRunLoop().
*/
export function requireComputerUseSwift(): ComputerUseAPI {
if (process.platform !== 'darwin') {
throw new Error('@ant/computer-use-swift is macOS-only')
}
if (cached) return cached
// eslint-disable-next-line @typescript-eslint/no-require-imports
return (cached ??= require('@ant/computer-use-swift') as ComputerUseAPI)
const mod = require('@ant/computer-use-swift')
// macOS native .node exports a plain object with apps/display/screenshot directly.
// Our cross-platform package exports { ComputerUseAPI } class — needs instantiation.
if (mod.ComputerUseAPI && typeof mod.ComputerUseAPI === 'function') {
cached = new mod.ComputerUseAPI() as ComputerUseAPI
} else {
cached = mod as ComputerUseAPI
}
return cached
}
export type { ComputerUseAPI }

View File

@@ -0,0 +1,257 @@
/**
* OCR module using Windows.Media.Ocr.OcrEngine via PowerShell.
* Captures a screen region or window, then runs WinRT OCR to extract text.
*/
/**
 * One recognized line of text together with the box enclosing its words.
 */
export interface OcrLine {
// Text of the line as reported by the OCR engine.
text: string
// Smallest box covering every word of the line: x/y is the top-left corner,
// w/h the size, all truncated to integers by the generating script.
// NOTE(review): presumably relative to the captured region, not the screen — confirm.
bounds: { x: number; y: number; w: number; h: number }
}
/**
 * Result of one OCR pass. An empty `text` with no `lines` is also what the
 * module returns whenever recognition fails or cannot be attempted.
 */
export interface OcrResult {
// Full recognized text.
text: string
// Per-line breakdown of the recognized text.
lines: OcrLine[]
// Language tag reported by the script, or the requested tag when the script
// did not report one.
language: string
}
/**
 * Builds the "nothing recognized" result for the given language tag.
 *
 * @param language - Language tag to echo back in the result.
 * @returns A fresh result with empty text and no lines.
 */
function emptyResult(language: string): OcrResult {
  const blank: OcrResult = { text: '', lines: [], language }
  return blank
}
/**
 * Runs a PowerShell script synchronously and returns its trimmed stdout.
 *
 * stderr is piped but never read, and the exit code is not checked, so a
 * failing script simply yields whatever it managed to print (possibly '').
 *
 * @param script - PowerShell source passed via `-Command`.
 * @returns stdout decoded with TextDecoder's default encoding, trimmed.
 */
function runPs(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({
    cmd: argv,
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
/**
 * Builds the PowerShell script that OCRs a rectangular screen region.
 *
 * The generated script:
 *   1. Screenshots the region using CopyFromScreen
 *   2. Saves it to a temp PNG
 *   3. Loads it via WinRT BitmapDecoder -> SoftwareBitmap
 *   4. Creates an OcrEngine for `lang`, falling back to en-US
 *   5. Runs OcrEngine.RecognizeAsync
 *   6. Outputs JSON with text, lines, and bounding rects
 *
 * On any failure the script prints an empty-result JSON document instead.
 * The stream and temp file are released in a `finally` block so they are
 * cleaned up on the early-return and error paths as well.
 *
 * @param x - Left coordinate of the region
 * @param y - Top coordinate of the region
 * @param w - Width in pixels
 * @param h - Height in pixels
 * @param lang - BCP-47 language tag; escaped before being embedded
 * @returns PowerShell source to pass to `powershell -Command`
 */
function buildOcrRegionScript(
  x: number,
  y: number,
  w: number,
  h: number,
  lang: string,
): string {
  // PowerShell accepts ' and the typographic quotes U+2018/U+2019/U+201A/U+201B
  // as single-quote delimiters; doubling each one keeps it inside the literal.
  const psQuote = (s: string): string =>
    s.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const psLang = psQuote(lang)
  // Serialize the fallback with JSON.stringify so the tag is valid JSON even
  // when it contains quotes or backslashes, then make it PowerShell-safe.
  const psFallbackJson = psQuote(
    JSON.stringify({ text: '', lines: [], language: lang }),
  )
  return `
Add-Type -AssemblyName System.Drawing
Add-Type -AssemblyName System.Runtime.WindowsRuntime
# Load WinRT types
$null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Graphics.Imaging.SoftwareBitmap, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Graphics.Imaging.BitmapDecoder, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Storage.StorageFile, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Storage.Streams.RandomAccessStream, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Globalization.Language, Windows.Foundation, ContentType = WindowsRuntime]
# Await helper for WinRT async operations
$asTaskGeneric = ([System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
$_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1 -and
$_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation\`1'
})[0]
Function Await($WinRtTask, $ResultType) {
$asTask = $asTaskGeneric.MakeGenericMethod($ResultType)
$netTask = $asTask.Invoke($null, @($WinRtTask))
$netTask.Wait(-1) | Out-Null
$netTask.Result
}
$stream = $null
$tmpFile = $null
try {
# Step 1: Screenshot region
$bmp = New-Object System.Drawing.Bitmap(${w}, ${h})
$g = [System.Drawing.Graphics]::FromImage($bmp)
$g.CopyFromScreen(${x}, ${y}, 0, 0, (New-Object System.Drawing.Size(${w}, ${h})))
$g.Dispose()
# Step 2: Save to temp file
$tmpFile = [System.IO.Path]::Combine([System.IO.Path]::GetTempPath(), "ocrtemp_$([guid]::NewGuid().ToString('N')).png")
$bmp.Save($tmpFile, [System.Drawing.Imaging.ImageFormat]::Png)
$bmp.Dispose()
# Step 3: Open as StorageFile -> BitmapDecoder -> SoftwareBitmap
$storageFile = Await ([Windows.Storage.StorageFile]::GetFileFromPathAsync($tmpFile)) ([Windows.Storage.StorageFile])
$stream = Await ($storageFile.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
$decoder = Await ([Windows.Graphics.Imaging.BitmapDecoder]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
$softwareBmp = Await ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])
# Step 4: Create OCR engine
$ocrLang = New-Object Windows.Globalization.Language('${psLang}')
$engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
if ($engine -eq $null) {
# Fallback to en-US
$ocrLang = New-Object Windows.Globalization.Language('en-US')
$engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
}
if ($engine -eq $null) {
Write-Output '${psFallbackJson}'
return
}
# Step 5: Run OCR
$ocrResult = Await ($engine.RecognizeAsync($softwareBmp)) ([Windows.Media.Ocr.OcrResult])
# Step 6: Extract lines with bounding rects
$lines = @()
foreach ($line in $ocrResult.Lines) {
$minX = [double]::MaxValue; $minY = [double]::MaxValue
$maxX = 0.0; $maxY = 0.0
foreach ($word in $line.Words) {
$r = $word.BoundingRect
if ($r.X -lt $minX) { $minX = $r.X }
if ($r.Y -lt $minY) { $minY = $r.Y }
if (($r.X + $r.Width) -gt $maxX) { $maxX = $r.X + $r.Width }
if (($r.Y + $r.Height) -gt $maxY) { $maxY = $r.Y + $r.Height }
}
$lines += @{
text = $line.Text
bounds = @{
x = [int]$minX
y = [int]$minY
w = [int]($maxX - $minX)
h = [int]($maxY - $minY)
}
}
}
$output = @{
text = $ocrResult.Text
lines = $lines
language = $ocrLang.LanguageTag
}
Write-Output (ConvertTo-Json $output -Depth 4 -Compress)
} catch {
Write-Output '${psFallbackJson}'
} finally {
# Cleanup runs on every path (success, early return, error)
if ($stream) { try { $stream.Dispose() } catch {} }
if ($tmpFile) { Remove-Item -LiteralPath $tmpFile -ErrorAction SilentlyContinue }
}
`
}
/**
 * Builds the PowerShell script that looks up a window's bounding rect by title.
 *
 * The script compiles a small C# helper that calls FindWindow/GetWindowRect
 * and prints either "left,top,width,height", "NOT_FOUND" (no window with that
 * exact title) or "INVALID_SIZE" (non-positive width or height).
 *
 * @param windowTitle - Exact window title; escaped before being embedded
 * @returns PowerShell source to pass to `powershell -Command`
 */
function buildGetWindowRectScript(windowTitle: string): string {
  // PowerShell accepts ' and the typographic quotes U+2018/U+2019/U+201A/U+201B
  // as single-quote delimiters, so every one of them must be doubled; escaping
  // only the ASCII quote lets a title such as "Bob’s file" end the literal.
  const escaped = windowTitle.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  return `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WinRect {
[DllImport("user32.dll", CharSet=CharSet.Unicode)]
public static extern IntPtr FindWindow(string c, string t);
[DllImport("user32.dll")]
public static extern bool GetWindowRect(IntPtr h, out RECT r);
[StructLayout(LayoutKind.Sequential)]
public struct RECT { public int L, T, R, B; }
public static string Get(string title) {
IntPtr hwnd = FindWindow(null, title);
if (hwnd == IntPtr.Zero) return "NOT_FOUND";
RECT r; GetWindowRect(hwnd, out r);
int w = r.R - r.L; int h = r.B - r.T;
if (w <= 0 || h <= 0) return "INVALID_SIZE";
return r.L + "," + r.T + "," + w + "," + h;
}
}
'@
[WinRect]::Get('${escaped}')
`
}
/**
 * Converts the JSON printed by the OCR script into an OcrResult.
 *
 * Missing fields are defaulted ('' for text, 0 for each bound, `lang` for the
 * language). Empty input, malformed JSON, or any unexpected shape that makes
 * the conversion throw yields an empty result for `lang`.
 *
 * @param raw - stdout of the OCR script
 * @param lang - Language tag used when the output does not carry one
 */
function parseOcrOutput(raw: string, lang: string): OcrResult {
  const fallback: OcrResult = { text: '', lines: [], language: lang }
  if (!raw) return fallback

  try {
    const data = JSON.parse(raw)
    const text = data.text ?? ''

    const lines: OcrResult['lines'] = []
    if (Array.isArray(data.lines)) {
      for (const entry of data.lines as any[]) {
        const box = entry.bounds
        lines.push({
          text: entry.text ?? '',
          bounds: {
            x: box?.x ?? 0,
            y: box?.y ?? 0,
            w: box?.w ?? 0,
            h: box?.h ?? 0,
          },
        })
      }
    }

    return { text, lines, language: data.language ?? lang }
  } catch {
    return fallback
  }
}
/**
 * Runs OCR over a rectangle of the screen.
 *
 * The rectangle is screenshotted and fed to the WinRT OcrEngine by a generated
 * PowerShell script. Never rejects: a non-positive size or any failure
 * resolves to an empty result.
 *
 * @param x - Left coordinate
 * @param y - Top coordinate
 * @param w - Width in pixels
 * @param h - Height in pixels
 * @param lang - BCP-47 language tag (default 'en-US'). Confirmed: 'en-US', 'zh-Hans-CN'
 */
export async function ocrRegion(
  x: number,
  y: number,
  w: number,
  h: number,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'

  // Nothing to capture for a degenerate rectangle.
  if (w <= 0 || h <= 0) return emptyResult(language)

  let outcome: OcrResult
  try {
    const output = runPs(buildOcrRegionScript(x, y, w, h, language))
    outcome = parseOcrOutput(output, language)
  } catch {
    outcome = emptyResult(language)
  }
  return outcome
}
/**
 * Runs OCR over a window identified by its title.
 *
 * Resolves the window's rectangle first, then hands that rectangle to
 * ocrRegion. A missing window, an unusable rectangle, or any thrown error
 * resolves to an empty result.
 *
 * @param windowTitle - Exact window title to find via FindWindow
 * @param lang - BCP-47 language tag (default 'en-US')
 */
export async function ocrWindow(
  windowTitle: string,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'
  try {
    const rect = runPs(buildGetWindowRectScript(windowTitle)).trim()

    // The rect script prints a sentinel instead of numbers when it cannot help.
    const isSentinel = rect === 'NOT_FOUND' || rect === 'INVALID_SIZE'
    if (rect === '' || isSentinel) return emptyResult(language)

    const fields = rect.split(',')
    if (fields.length !== 4) return emptyResult(language)

    const [left, top, width, height] = fields.map(Number)
    // Rejects both zero and NaN sizes.
    if (!width || !height) return emptyResult(language)

    return ocrRegion(left, top, width, height, lang)
  } catch {
    return emptyResult(language)
  }
}

View File

@@ -0,0 +1,308 @@
/**
* Windows UI Automation module
*
* Provides UI element tree inspection, element lookup, programmatic click,
* value setting, and hit-testing via PowerShell + System.Windows.Automation.
*/
/**
 * Snapshot of one UI Automation element as serialized by the PowerShell
 * scripts in this module.
 */
export interface UIElement {
// Element's Name property, coerced to a string.
name: string
controlType: string // Button, Edit, Text, List, Window, etc.
// Element's AutomationId property, coerced to a string.
automationId: string
// Bounding rectangle with each component truncated to an integer.
// NOTE(review): presumably screen coordinates — confirm against UI Automation docs.
boundingRect: { x: number; y: number; w: number; h: number }
// Element's IsEnabled property.
isEnabled: boolean
// Only present when the element exposes ValuePattern.
value?: string
// Only present when the tree walk found children within the depth limit.
children?: UIElement[]
}
// ---------------------------------------------------------------------------
// Helper
// ---------------------------------------------------------------------------
// PowerShell preamble that loads the .NET assemblies the UI Automation
// scripts rely on. It is prepended to every script built in this module.
const UIA_ASSEMBLIES = `
Add-Type -AssemblyName UIAutomationClient
Add-Type -AssemblyName UIAutomationTypes
Add-Type -AssemblyName WindowsBase
`
/**
 * Runs a PowerShell script synchronously and returns its trimmed stdout.
 *
 * stderr is piped but never read, and the exit code is not checked, so
 * callers only ever see what the script printed.
 *
 * @param script - PowerShell source passed via `-Command`.
 */
function ps(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({
    cmd: argv,
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
/**
 * Parses JSON without throwing.
 *
 * @param raw - Text to parse
 * @param fallback - Value returned for empty or malformed input
 * @returns The parsed value (unchecked cast to T), or `fallback`
 */
function parseJsonSafe<T>(raw: string, fallback: T): T {
  if (!raw) return fallback

  let value: T
  try {
    value = JSON.parse(raw) as T
  } catch {
    value = fallback
  }
  return value
}
// PowerShell snippet that finds a top-level window by title and leaves it in
// $window ($null when nothing matches).
// Assumes $title is already set in the calling script.
// Lookup order: first an exact Name match among the root element's children;
// if that finds nothing, the first child whose Name contains $title.
const PS_FIND_WINDOW = `
$root = [System.Windows.Automation.AutomationElement]::RootElement
$window = $root.FindFirst(
[System.Windows.Automation.TreeScope]::Children,
[System.Windows.Automation.PropertyCondition]::new(
[System.Windows.Automation.AutomationElement]::NameProperty, $title))
if ($window -eq $null) {
$all = $root.FindAll(
[System.Windows.Automation.TreeScope]::Children,
[System.Windows.Automation.Condition]::TrueCondition)
foreach ($el in $all) {
if ($el.Current.Name -and $el.Current.Name.Contains($title)) {
$window = $el
break
}
}
}
`
// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------
/**
 * Get the UI element tree of a window, up to `depth` levels deep (default 3).
 *
 * The window is located by exact title, then by substring. Every returned
 * node's `children`, when present, is guaranteed to be an array: PowerShell
 * unrolls a one-element result into a bare object, which is re-wrapped here
 * at every level rather than only at the top.
 *
 * @param windowTitle - Window title to look up; escaped before being embedded
 * @param depth - Levels to descend. Truncated to an integer and clamped to
 *   >= 0; a non-finite value falls back to 3.
 * @returns The window's child elements, or [] when the window is not found
 *   or the script output cannot be parsed
 */
export function getUITree(windowTitle: string, depth: number = 3): UIElement[] {
  // PowerShell accepts ' and the typographic quotes U+2018/U+2019/U+201A/U+201B
  // as single-quote delimiters; doubling each one keeps it inside the literal.
  const escapedTitle = windowTitle.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  // Only a plain non-negative integer may be spliced into the script.
  const maxDepth = Number.isFinite(depth) ? Math.max(0, Math.trunc(depth)) : 3
  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
Write-Output '[]'
exit
}
function Get-UIChildren($parent, $currentDepth, $maxDepth) {
if ($currentDepth -ge $maxDepth) { return @() }
$children = $parent.FindAll(
[System.Windows.Automation.TreeScope]::Children,
[System.Windows.Automation.Condition]::TrueCondition)
$result = @()
foreach ($el in $children) {
$rect = $el.Current.BoundingRectangle
$obj = @{
name = [string]$el.Current.Name
controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
automationId = [string]$el.Current.AutomationId
boundingRect = @{
x = [int]$rect.X
y = [int]$rect.Y
w = [int]$rect.Width
h = [int]$rect.Height
}
isEnabled = $el.Current.IsEnabled
}
try {
$vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
} catch {}
$sub = Get-UIChildren $el ($currentDepth + 1) $maxDepth
if ($sub.Count -gt 0) { $obj['children'] = $sub }
$result += $obj
}
return $result
}
$tree = Get-UIChildren $window 0 ${maxDepth}
if ($tree -eq $null -or $tree.Count -eq 0) {
Write-Output '[]'
} else {
$tree | ConvertTo-Json -Depth 20 -Compress
}
`
  const raw = ps(script)
  const parsed = parseJsonSafe<unknown>(raw, [])

  // Re-wrap lone objects as one-element arrays, recursively through children.
  const toList = (value: unknown): UIElement[] => {
    if (value === null || value === undefined) return []
    const candidates: unknown[] = Array.isArray(value) ? value : [value]
    const nodes = candidates.filter(
      (node): node is UIElement => node !== null && typeof node === 'object',
    )
    for (const node of nodes) {
      if ('children' in node) node.children = toList(node.children)
    }
    return nodes
  }
  return toList(parsed)
}
/**
 * Find a single element inside a window matching the given query fields.
 *
 * All supplied fields must match (they are AND-ed). The search covers every
 * descendant of the window and returns the first hit.
 *
 * @param windowTitle - Window title to look up; escaped before being embedded
 * @param query - Fields to match. `controlType` must be a bare identifier
 *   (letters only, e.g. 'Button'), because it is emitted as a static member
 *   name rather than a string; anything else is rejected.
 * @returns The matching element, or null when the query is empty or invalid,
 *   the window or element is not found, or the output cannot be parsed
 */
export function findElement(
  windowTitle: string,
  query: { name?: string; controlType?: string; automationId?: string },
): UIElement | null {
  // PowerShell accepts ' and the typographic quotes U+2018/U+2019/U+201A/U+201B
  // as single-quote delimiters; doubling each one keeps it inside the literal.
  const psQuote = (s: string): string =>
    s.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const escapedTitle = psQuote(windowTitle)
  // Build conditions array
  const conditions: string[] = []
  if (query.name) {
    conditions.push(
      `[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::NameProperty, '${psQuote(query.name)}')`,
    )
  }
  if (query.controlType) {
    // This value lands outside any quotes, so quote-escaping cannot protect
    // it; only a plain member name may be spliced into the script.
    if (!/^[A-Za-z]+$/.test(query.controlType)) return null
    conditions.push(
      `[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::ControlTypeProperty, [System.Windows.Automation.ControlType]::${query.controlType})`,
    )
  }
  if (query.automationId) {
    conditions.push(
      `[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::AutomationIdProperty, '${psQuote(query.automationId)}')`,
    )
  }
  if (conditions.length === 0) return null
  const conditionExpr =
    conditions.length === 1
      ? conditions[0]
      : `[System.Windows.Automation.AndCondition]::new(@(${conditions.join(', ')}))`
  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
Write-Output 'null'
exit
}
$cond = ${conditionExpr}
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
Write-Output 'null'
exit
}
$rect = $el.Current.BoundingRectangle
$obj = @{
name = [string]$el.Current.Name
controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
automationId = [string]$el.Current.AutomationId
boundingRect = @{
x = [int]$rect.X
y = [int]$rect.Y
w = [int]$rect.Width
h = [int]$rect.Height
}
isEnabled = $el.Current.IsEnabled
}
try {
$vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
} catch {}
$obj | ConvertTo-Json -Compress
`
  const raw = ps(script)
  return parseJsonSafe<UIElement | null>(raw, null)
}
/**
 * Click an element by its automationId using InvokePattern.
 *
 * @param windowTitle - Window title to look up; escaped before being embedded
 * @param automationId - AutomationId of the element to invoke; escaped likewise
 * @returns true only when the script reports a successful Invoke; false when
 *   the window or element is missing, the element does not support
 *   InvokePattern, or the script prints anything else
 */
export function clickElement(windowTitle: string, automationId: string): boolean {
  // PowerShell accepts ' and the typographic quotes U+2018/U+2019/U+201A/U+201B
  // as single-quote delimiters; doubling each one keeps it inside the literal.
  const psQuote = (s: string): string =>
    s.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const escapedTitle = psQuote(windowTitle)
  const escapedId = psQuote(automationId)
  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
Write-Output 'false'
exit
}
$cond = [System.Windows.Automation.PropertyCondition]::new(
[System.Windows.Automation.AutomationElement]::AutomationIdProperty, '${escapedId}')
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
Write-Output 'false'
exit
}
try {
$ip = $el.GetCurrentPattern([System.Windows.Automation.InvokePattern]::Pattern)
$ip.Invoke()
Write-Output 'true'
} catch {
Write-Output 'false'
}
`
  return ps(script) === 'true'
}
/**
 * Set the value of an element by its automationId using ValuePattern.
 *
 * @param windowTitle - Window title to look up; escaped before being embedded
 * @param automationId - AutomationId of the target element; escaped likewise
 * @param value - Text to assign; escaped likewise, so quotes of any kind are
 *   passed through literally
 * @returns true only when the script reports a successful SetValue; false
 *   when the window or element is missing, the element does not support
 *   ValuePattern, or the script prints anything else
 */
export function setValue(windowTitle: string, automationId: string, value: string): boolean {
  // PowerShell accepts ' and the typographic quotes U+2018/U+2019/U+201A/U+201B
  // as single-quote delimiters; doubling each one keeps it inside the literal.
  // `value` is free-form text, so curly apostrophes are entirely ordinary here.
  const psQuote = (s: string): string =>
    s.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const escapedTitle = psQuote(windowTitle)
  const escapedId = psQuote(automationId)
  const escapedValue = psQuote(value)
  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
Write-Output 'false'
exit
}
$cond = [System.Windows.Automation.PropertyCondition]::new(
[System.Windows.Automation.AutomationElement]::AutomationIdProperty, '${escapedId}')
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
Write-Output 'false'
exit
}
try {
$vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
$vp.SetValue('${escapedValue}')
Write-Output 'true'
} catch {
Write-Output 'false'
}
`
  return ps(script) === 'true'
}
/**
 * Get the UI element at a specific screen coordinate.
 *
 * @param x - Horizontal coordinate passed to AutomationElement.FromPoint
 * @param y - Vertical coordinate passed to AutomationElement.FromPoint
 * @returns The element at that point, or null when the coordinates are not
 *   finite numbers, no element is found, the lookup throws, or the output
 *   cannot be parsed
 */
export function elementAtPoint(x: number, y: number): UIElement | null {
  // NaN/Infinity would be spliced in as bare words and make the whole script
  // unparsable; answer directly instead of spawning PowerShell to fail.
  if (!Number.isFinite(x) || !Number.isFinite(y)) return null
  const script = `
${UIA_ASSEMBLIES}
try {
$point = [System.Windows.Point]::new(${x}, ${y})
$el = [System.Windows.Automation.AutomationElement]::FromPoint($point)
if ($el -eq $null) {
Write-Output 'null'
exit
}
$rect = $el.Current.BoundingRectangle
$obj = @{
name = [string]$el.Current.Name
controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
automationId = [string]$el.Current.AutomationId
boundingRect = @{
x = [int]$rect.X
y = [int]$rect.Y
w = [int]$rect.Width
h = [int]$rect.Height
}
isEnabled = $el.Current.IsEnabled
}
try {
$vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
} catch {}
$obj | ConvertTo-Json -Compress
} catch {
Write-Output 'null'
}
`
  const raw = ps(script)
  return parseJsonSafe<UIElement | null>(raw, null)
}

View File

@@ -0,0 +1,129 @@
/**
* Window-level screenshot capture using Win32 PrintWindow API.
* Captures windows even when occluded or minimized.
*/
/**
 * A captured window image.
 */
interface CaptureResult {
// PNG image data, base64-encoded.
base64: string
// Width of the captured bitmap, taken from the window rectangle.
width: number
// Height of the captured bitmap, taken from the window rectangle.
height: number
}
// PowerShell preamble that compiles the C# helper class WinCap.
// WinCap.Capture(title) finds a window by exact title (FindWindow), renders it
// into a bitmap with PrintWindow, and returns "width,height,<base64 PNG>".
// It returns "NOT_FOUND" when no window has that title and "INVALID_SIZE" when
// the window rectangle has a non-positive width or height.
// NOTE(review): PrintWindow is called with flag 2, presumably
// PW_RENDERFULLCONTENT — confirm against the Win32 headers.
// The '@ here-string terminator must stay at the start of its line.
const CAPTURE_BY_TITLE_PS = `
Add-Type -AssemblyName System.Drawing
Add-Type -ReferencedAssemblies System.Drawing @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
public class WinCap {
[DllImport("user32.dll", CharSet=CharSet.Unicode)]
public static extern IntPtr FindWindow(string c, string t);
[DllImport("user32.dll")]
public static extern bool GetWindowRect(IntPtr h, out RECT r);
[DllImport("user32.dll")]
public static extern bool PrintWindow(IntPtr h, IntPtr hdc, uint f);
[StructLayout(LayoutKind.Sequential)]
public struct RECT { public int L, T, R, B; }
public static string Capture(string title) {
IntPtr hwnd = FindWindow(null, title);
if (hwnd == IntPtr.Zero) return "NOT_FOUND";
RECT r; GetWindowRect(hwnd, out r);
int w = r.R - r.L; int h = r.B - r.T;
if (w <= 0 || h <= 0) return "INVALID_SIZE";
Bitmap bmp = new Bitmap(w, h);
Graphics g = Graphics.FromImage(bmp);
IntPtr hdc = g.GetHdc();
PrintWindow(hwnd, hdc, 2);
g.ReleaseHdc(hdc); g.Dispose();
var ms = new System.IO.MemoryStream();
bmp.Save(ms, ImageFormat.Png);
bmp.Dispose();
return w + "," + h + "," + Convert.ToBase64String(ms.ToArray());
}
}
'@
`
// PowerShell preamble that compiles the C# helper class WinCapH.
// WinCapH.Capture(hwnd) validates the handle with IsWindow, renders the window
// into a bitmap with PrintWindow, and returns "width,height,<base64 PNG>".
// It returns "NOT_FOUND" when the handle is not a window and "INVALID_SIZE"
// when the window rectangle has a non-positive width or height.
// NOTE(review): PrintWindow is called with flag 2, presumably
// PW_RENDERFULLCONTENT — confirm against the Win32 headers.
// The '@ here-string terminator must stay at the start of its line.
const CAPTURE_BY_HWND_PS = `
Add-Type -AssemblyName System.Drawing
Add-Type -ReferencedAssemblies System.Drawing @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
public class WinCapH {
[DllImport("user32.dll")]
public static extern bool GetWindowRect(IntPtr h, out RECT r);
[DllImport("user32.dll")]
public static extern bool PrintWindow(IntPtr h, IntPtr hdc, uint f);
[DllImport("user32.dll")]
public static extern bool IsWindow(IntPtr hWnd);
[StructLayout(LayoutKind.Sequential)]
public struct RECT { public int L, T, R, B; }
public static string Capture(IntPtr hwnd) {
if (!IsWindow(hwnd)) return "NOT_FOUND";
RECT r; GetWindowRect(hwnd, out r);
int w = r.R - r.L; int h = r.B - r.T;
if (w <= 0 || h <= 0) return "INVALID_SIZE";
Bitmap bmp = new Bitmap(w, h);
Graphics g = Graphics.FromImage(bmp);
IntPtr hdc = g.GetHdc();
PrintWindow(hwnd, hdc, 2);
g.ReleaseHdc(hdc); g.Dispose();
var ms = new System.IO.MemoryStream();
bmp.Save(ms, ImageFormat.Png);
bmp.Dispose();
return w + "," + h + "," + Convert.ToBase64String(ms.ToArray());
}
}
'@
`
/**
 * Parses the "width,height,base64" line printed by the capture helpers.
 *
 * @param raw - stdout of a capture script
 * @returns The decoded fields, or null for empty output, the NOT_FOUND /
 *   INVALID_SIZE sentinels, fewer than two commas, a zero or non-numeric
 *   dimension, or an empty payload
 */
function parseCaptureOutput(raw: string): CaptureResult | null {
  const text = raw.trim()
  if (text === '' || text === 'NOT_FOUND' || text === 'INVALID_SIZE') {
    return null
  }

  // Two leading numeric fields, then the payload. Any further commas belong to
  // the payload, so the remaining pieces are joined back together.
  const pieces = text.split(',')
  if (pieces.length < 3) return null

  const width = Number(pieces[0])
  const height = Number(pieces[1])
  const base64 = pieces.slice(2).join(',')

  const usable = Boolean(width) && Boolean(height) && base64 !== ''
  return usable ? { base64, width, height } : null
}
/**
 * Runs a PowerShell script synchronously and returns its trimmed stdout.
 *
 * stderr is piped but never read, and the exit code is not checked, so a
 * failing script simply yields whatever it managed to print (possibly '').
 *
 * @param script - PowerShell source passed via `-Command`.
 */
function runPs(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({
    cmd: argv,
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
/**
 * Capture a window screenshot by its exact title.
 * Uses PrintWindow which works even for occluded/background windows.
 *
 * @param title - Exact window title; escaped before being embedded
 * @returns The PNG capture, or null when the window is not found, has no
 *   area, or the script output cannot be parsed
 */
export function captureWindow(title: string): CaptureResult | null {
  // PowerShell accepts ' and the typographic quotes U+2018/U+2019/U+201A/U+201B
  // as single-quote delimiters, so every one of them must be doubled; escaping
  // only the ASCII quote lets a title such as "Bob’s file" end the literal.
  const escaped = title.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `${CAPTURE_BY_TITLE_PS}\n[WinCap]::Capture('${escaped}')`
  const raw = runPs(script)
  return parseCaptureOutput(raw)
}
/**
 * Capture a window screenshot by its HWND handle.
 *
 * @param hwnd - Window handle as an integer
 * @returns The PNG capture, or null when `hwnd` is not a safe integer, is not
 *   a live window, the window has no area, or the output cannot be parsed
 */
export function captureWindowByHwnd(hwnd: number): CaptureResult | null {
  // The handle is spliced into the script as a numeric literal. NaN, Infinity,
  // fractions and exponent-form values would produce a malformed or ambiguous
  // [IntPtr]::new(...) expression, so reject them before spawning PowerShell.
  if (!Number.isSafeInteger(hwnd)) return null
  const script = `${CAPTURE_BY_HWND_PS}\n[WinCapH]::Capture([IntPtr]::new(${hwnd}))`
  const raw = runPs(script)
  return parseCaptureOutput(raw)
}

View File

@@ -0,0 +1,86 @@
/**
* Window enumeration using Win32 EnumWindows API.
* Returns visible windows with their HWND, PID, and title.
*/
/**
 * One visible top-level window as reported by the enumeration script.
 */
export interface WindowInfo {
// Window handle, converted from the script's 64-bit integer output.
hwnd: number
// Id of the process that owns the window (from GetWindowThreadProcessId).
pid: number
// Window title (from GetWindowText); never empty.
title: string
}
// PowerShell script that compiles the C# helper class WinEnum and prints one
// line per window in the form "hwnd|pid|title".
// WinEnum.Run() walks all top-level windows with EnumWindows and skips those
// that are not visible or whose title is empty or whitespace-only.
// The title is the last field, so any '|' characters it contains are kept.
// The '@ here-string terminator must stay at the start of its line.
const ENUM_WINDOWS_PS = `
Add-Type @'
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
public class WinEnum {
public delegate bool EnumWindowsProc(IntPtr hWnd, IntPtr lParam);
[DllImport("user32.dll")]
public static extern bool EnumWindows(EnumWindowsProc lpEnumFunc, IntPtr lParam);
[DllImport("user32.dll")]
public static extern bool IsWindowVisible(IntPtr hWnd);
[DllImport("user32.dll", CharSet=CharSet.Unicode)]
public static extern int GetWindowText(IntPtr hWnd, StringBuilder lpString, int nMaxCount);
[DllImport("user32.dll")]
public static extern int GetWindowTextLength(IntPtr hWnd);
[DllImport("user32.dll")]
public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint processId);
public static List<string> results = new List<string>();
public static void Run() {
results.Clear();
EnumWindows(delegate(IntPtr hWnd, IntPtr lParam) {
if (!IsWindowVisible(hWnd)) return true;
int len = GetWindowTextLength(hWnd);
if (len == 0) return true;
StringBuilder sb = new StringBuilder(len + 1);
GetWindowText(hWnd, sb, sb.Capacity);
string title = sb.ToString();
if (string.IsNullOrWhiteSpace(title)) return true;
uint pid = 0;
GetWindowThreadProcessId(hWnd, out pid);
results.Add(hWnd.ToInt64() + "|" + pid + "|" + title);
return true;
}, IntPtr.Zero);
}
}
'@
[WinEnum]::Run()
[WinEnum]::results | ForEach-Object { $_ }
`
/**
 * List all visible windows with non-empty titles.
 *
 * Runs the enumeration script synchronously and parses its "hwnd|pid|title"
 * lines. Lines that do not have both separators, carry a non-numeric hwnd or
 * pid, or have an empty title are dropped.
 *
 * @returns HWND, PID, and window title for each window; [] when the script
 *   prints nothing
 */
export function listWindows(): WindowInfo[] {
  const { stdout } = Bun.spawnSync({
    cmd: ['powershell', '-NoProfile', '-NonInteractive', '-Command', ENUM_WINDOWS_PS],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const output = new TextDecoder().decode(stdout).trim()
  if (output === '') return []

  const windows: WindowInfo[] = []
  for (const rawLine of output.split('\n')) {
    if (!rawLine) continue

    // trim() also strips the '\r' left behind by CRLF line endings.
    const line = rawLine.trim()
    const pidStart = line.indexOf('|')
    const titleStart = line.indexOf('|', pidStart + 1)
    if (pidStart === -1 || titleStart === -1) continue

    const hwnd = Number(line.slice(0, pidStart))
    const pid = Number(line.slice(pidStart + 1, titleStart))
    // Everything after the second separator is the title, pipes included.
    const title = line.slice(titleStart + 1)
    if (isNaN(hwnd) || isNaN(pid) || !title) continue

    windows.push({ hwnd, pid, title })
  }
  return windows
}