mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-17 22:05:50 +00:00
feat: Windows Computer Use enhancement — PrintWindow, UI Automation, OCR
New Windows-native capabilities: - windowCapture.ts: PrintWindow API for per-window screenshot (works on occluded/background windows) - windowEnum.ts: EnumWindows for precise window enumeration with HWND - uiAutomation.ts: IUIAutomation for UI tree reading, element clicking, text input, and coordinate-based element identification - ocr.ts: Windows.Media.Ocr for screen text recognition (en-US + zh-CN) Updated win32.ts backend to use EnumWindows for listRunning() and added captureWindowTarget() for window-specific screenshots. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
257
src/utils/computerUse/win32/ocr.ts
Normal file
257
src/utils/computerUse/win32/ocr.ts
Normal file
@@ -0,0 +1,257 @@
|
||||
/**
|
||||
* OCR module using Windows.Media.Ocr.OcrEngine via PowerShell.
|
||||
* Captures a screen region or window, then runs WinRT OCR to extract text.
|
||||
*/
|
||||
|
||||
/** One recognized line of text together with its bounding box. */
export interface OcrLine {
  /** Text of the line. */
  text: string
  /** Box around the line: origin (x, y) plus width (w) and height (h). */
  bounds: {
    x: number
    y: number
    w: number
    h: number
  }
}
|
||||
|
||||
/** Outcome of a single OCR pass. */
export interface OcrResult {
  /** Full recognized text. */
  text: string
  /** Recognized lines, each with its bounding box. */
  lines: Array<OcrLine>
  /** Language tag associated with this result. */
  language: string
}
|
||||
|
||||
/** Builds the placeholder result (no text, no lines) tagged with `language`. */
function emptyResult(language: string): OcrResult {
  const blank: OcrResult = { text: '', lines: [], language }
  return blank
}
|
||||
|
||||
/**
 * Runs a PowerShell script synchronously and returns its trimmed stdout.
 *
 * stderr is captured but not inspected; a failing script simply yields
 * whatever (possibly empty) text it wrote to stdout.
 *
 * @param script - PowerShell source passed via `-Command`.
 * @returns stdout decoded as UTF-8 with surrounding whitespace removed.
 */
function runPs(script: string): string {
  // PowerShell encodes redirected stdout with [Console]::OutputEncoding, which
  // defaults to the console code page rather than UTF-8. TextDecoder below
  // assumes UTF-8, so non-ASCII OCR text would otherwise be garbled. The switch
  // is best-effort (wrapped in try/catch) and uses a BOM-less encoding.
  const utf8Prelude =
    'try { [Console]::OutputEncoding = New-Object System.Text.UTF8Encoding $false } catch {}\n'
  const result = Bun.spawnSync({
    cmd: [
      'powershell',
      '-NoProfile',
      '-NonInteractive',
      '-Command',
      utf8Prelude + script,
    ],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  return new TextDecoder().decode(result.stdout).trim()
}
|
||||
|
||||
/**
 * Builds the PowerShell script that OCRs one screen rectangle. The script:
 *   1. Screenshots the region with CopyFromScreen
 *   2. Saves it to a temp PNG
 *   3. Loads it via WinRT StorageFile -> BitmapDecoder -> SoftwareBitmap
 *   4. Creates an OcrEngine for `lang`, falling back to en-US
 *   5. Runs RecognizeAsync
 *   6. Prints JSON with text, lines (with bounding boxes) and language
 *
 * On any failure the script prints an empty-result JSON carrying `lang`.
 * The temp file and stream are released in a `finally` block so they are
 * cleaned up on the early-return and error paths as well.
 *
 * @param x - Left coordinate of the region
 * @param y - Top coordinate of the region
 * @param w - Region width
 * @param h - Region height
 * @param lang - Language tag handed to Windows.Globalization.Language
 * @returns PowerShell source text
 */
function buildOcrRegionScript(
  x: number,
  y: number,
  w: number,
  h: number,
  lang: string,
): string {
  // PowerShell accepts ' and the typographic quotes U+2018..U+201B as
  // single-quote delimiters; doubling each keeps the value a plain literal.
  const psLiteral = (value: string): string =>
    `'${value.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')}'`
  const langLiteral = psLiteral(lang)
  // JSON.stringify escapes quotes and backslashes inside the tag, so the
  // fallback payload stays valid JSON whatever `lang` contains.
  const emptyJsonLiteral = psLiteral(
    JSON.stringify({ text: '', lines: [], language: lang }),
  )

  return `
Add-Type -AssemblyName System.Drawing
Add-Type -AssemblyName System.Runtime.WindowsRuntime

# Load WinRT types
$null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Graphics.Imaging.SoftwareBitmap, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Graphics.Imaging.BitmapDecoder, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Storage.StorageFile, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Storage.Streams.RandomAccessStream, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Globalization.Language, Windows.Foundation, ContentType = WindowsRuntime]

# Await helper for WinRT async operations
$asTaskGeneric = ([System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
  $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1 -and
  $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation\`1'
})[0]
Function Await($WinRtTask, $ResultType) {
  $asTask = $asTaskGeneric.MakeGenericMethod($ResultType)
  $netTask = $asTask.Invoke($null, @($WinRtTask))
  $netTask.Wait(-1) | Out-Null
  $netTask.Result
}

$tmpFile = $null
$stream = $null
try {
  # Step 1: Screenshot region
  $bmp = New-Object System.Drawing.Bitmap(${w}, ${h})
  $g = [System.Drawing.Graphics]::FromImage($bmp)
  $g.CopyFromScreen(${x}, ${y}, 0, 0, (New-Object System.Drawing.Size(${w}, ${h})))
  $g.Dispose()

  # Step 2: Save to temp file
  $tmpFile = [System.IO.Path]::Combine([System.IO.Path]::GetTempPath(), "ocrtemp_$([guid]::NewGuid().ToString('N')).png")
  $bmp.Save($tmpFile, [System.Drawing.Imaging.ImageFormat]::Png)
  $bmp.Dispose()

  # Step 3: Open as StorageFile -> BitmapDecoder -> SoftwareBitmap
  $storageFile = Await ([Windows.Storage.StorageFile]::GetFileFromPathAsync($tmpFile)) ([Windows.Storage.StorageFile])
  $stream = Await ($storageFile.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
  $decoder = Await ([Windows.Graphics.Imaging.BitmapDecoder]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
  $softwareBmp = Await ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])

  # Step 4: Create OCR engine
  $ocrLang = New-Object Windows.Globalization.Language(${langLiteral})
  $engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
  if ($engine -eq $null) {
    # Fallback to en-US
    $ocrLang = New-Object Windows.Globalization.Language('en-US')
    $engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
  }
  if ($engine -eq $null) {
    Write-Output ${emptyJsonLiteral}
    return
  }

  # Step 5: Run OCR
  $ocrResult = Await ($engine.RecognizeAsync($softwareBmp)) ([Windows.Media.Ocr.OcrResult])

  # Step 6: Extract lines with bounding rects (union of the word rects)
  $lines = @()
  foreach ($line in $ocrResult.Lines) {
    $minX = [double]::MaxValue; $minY = [double]::MaxValue
    $maxX = 0.0; $maxY = 0.0
    foreach ($word in $line.Words) {
      $r = $word.BoundingRect
      if ($r.X -lt $minX) { $minX = $r.X }
      if ($r.Y -lt $minY) { $minY = $r.Y }
      if (($r.X + $r.Width) -gt $maxX) { $maxX = $r.X + $r.Width }
      if (($r.Y + $r.Height) -gt $maxY) { $maxY = $r.Y + $r.Height }
    }
    $lines += @{
      text = $line.Text
      bounds = @{
        x = [int]$minX
        y = [int]$minY
        w = [int]($maxX - $minX)
        h = [int]($maxY - $minY)
      }
    }
  }

  $output = @{
    text = $ocrResult.Text
    lines = $lines
    language = $ocrLang.LanguageTag
  }
  Write-Output (ConvertTo-Json $output -Depth 4 -Compress)
} catch {
  Write-Output ${emptyJsonLiteral}
} finally {
  # Cleanup runs on success, early return and failure alike
  if ($stream -ne $null) { $stream.Dispose() }
  if ($tmpFile -ne $null) { Remove-Item -LiteralPath $tmpFile -ErrorAction SilentlyContinue }
}
`
}
|
||||
|
||||
/**
 * Builds a PowerShell script that prints a window's bounding rect, looked up
 * by exact title via FindWindow.
 *
 * The script prints `left,top,width,height`, or `NOT_FOUND` when no window
 * has that title, or `INVALID_SIZE` when the rect has no positive area.
 *
 * @param windowTitle - Window title passed to FindWindow
 * @returns PowerShell source text
 */
function buildGetWindowRectScript(windowTitle: string): string {
  // PowerShell accepts ' and the typographic quotes U+2018..U+201B as
  // single-quote delimiters, so all of them must be doubled. Escaping only
  // the ASCII quote lets a title such as "Don’t" end the literal early.
  const escaped = windowTitle.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  // NOTE: the closing '@ of the here-string must stay at the start of its line.
  return `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WinRect {
    [DllImport("user32.dll", CharSet=CharSet.Unicode)]
    public static extern IntPtr FindWindow(string c, string t);
    [DllImport("user32.dll")]
    public static extern bool GetWindowRect(IntPtr h, out RECT r);
    [StructLayout(LayoutKind.Sequential)]
    public struct RECT { public int L, T, R, B; }
    public static string Get(string title) {
        IntPtr hwnd = FindWindow(null, title);
        if (hwnd == IntPtr.Zero) return "NOT_FOUND";
        RECT r; GetWindowRect(hwnd, out r);
        int w = r.R - r.L; int h = r.B - r.T;
        if (w <= 0 || h <= 0) return "INVALID_SIZE";
        return r.L + "," + r.T + "," + w + "," + h;
    }
}
'@
[WinRect]::Get('${escaped}')
`
}
|
||||
|
||||
/**
 * Converts the OCR script's JSON output into an OcrResult.
 *
 * Missing fields are defaulted (empty text, zeroed bounds, `lang` as the
 * language). Empty or unparseable input yields an empty result for `lang`.
 */
function parseOcrOutput(raw: string, lang: string): OcrResult {
  if (raw) {
    try {
      const data = JSON.parse(raw)
      const toLine = (entry: any): OcrLine => {
        const lineText = entry.text ?? ''
        const box = entry.bounds
        return {
          text: lineText,
          bounds: {
            x: box?.x ?? 0,
            y: box?.y ?? 0,
            w: box?.w ?? 0,
            h: box?.h ?? 0,
          },
        }
      }
      const fullText = data.text ?? ''
      const lines: OcrLine[] = Array.isArray(data.lines)
        ? data.lines.map(toLine)
        : []
      return { text: fullText, lines, language: data.language ?? lang }
    } catch {
      // Malformed payload: fall through to the empty result.
    }
  }
  return emptyResult(lang)
}
|
||||
|
||||
/**
 * Perform OCR on a screen region.
 * Screenshots the specified rectangle, then runs WinRT OcrEngine.
 *
 * Coordinates are rounded to whole pixels before they are embedded in the
 * PowerShell script. Non-finite values, or a rectangle with no positive
 * area, return an empty result without spawning PowerShell. Any failure
 * while running the script also yields an empty result; this function does
 * not throw.
 *
 * @param x - Left coordinate
 * @param y - Top coordinate
 * @param w - Width in pixels
 * @param h - Height in pixels
 * @param lang - BCP-47 language tag (default 'en-US'). Confirmed: 'en-US', 'zh-Hans-CN'
 * @returns The OCR result, or an empty result tagged with the requested language
 */
export async function ocrRegion(
  x: number,
  y: number,
  w: number,
  h: number,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'

  // NaN/Infinity would be interpolated verbatim into the script and make it
  // invalid; reject them up front.
  if (![x, y, w, h].every(v => Number.isFinite(v))) return emptyResult(language)

  // The script builds a bitmap and a Size from these, so use whole pixels.
  const left = Math.round(x)
  const top = Math.round(y)
  const width = Math.round(w)
  const height = Math.round(h)
  if (width <= 0 || height <= 0) return emptyResult(language)

  try {
    const script = buildOcrRegionScript(left, top, width, height, language)
    const raw = runPs(script)
    return parseOcrOutput(raw, language)
  } catch {
    return emptyResult(language)
  }
}
|
||||
|
||||
/**
 * Perform OCR on a specific window by its title.
 * Gets the window rect, then delegates to ocrRegion.
 *
 * Returns an empty result when the window is not found, its rect is not
 * usable, or the rect lookup fails.
 *
 * @param windowTitle - Exact window title to find via FindWindow
 * @param lang - BCP-47 language tag (default 'en-US')
 */
export async function ocrWindow(
  windowTitle: string,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'

  try {
    const rectText = runPs(buildGetWindowRectScript(windowTitle)).trim()

    // Accept only "left,top,width,height" made of integers. This rejects the
    // NOT_FOUND / INVALID_SIZE markers and empty output, and also malformed
    // fields that Number() would silently turn into 0 or NaN.
    const match = /^(-?\d+),(-?\d+),(\d+),(\d+)$/.exec(rectText)
    if (!match) return emptyResult(language)

    const [x, y, w, h] = match.slice(1).map(Number)
    if (w <= 0 || h <= 0) return emptyResult(language)

    return ocrRegion(x, y, w, h, lang)
  } catch {
    return emptyResult(language)
  }
}
|
||||
308
src/utils/computerUse/win32/uiAutomation.ts
Normal file
308
src/utils/computerUse/win32/uiAutomation.ts
Normal file
@@ -0,0 +1,308 @@
|
||||
/**
|
||||
* Windows UI Automation module
|
||||
*
|
||||
* Provides UI element tree inspection, element lookup, programmatic click,
|
||||
* value setting, and hit-testing via PowerShell + System.Windows.Automation.
|
||||
*/
|
||||
|
||||
/** Snapshot of one UI Automation element as serialized by the scripts in this module. */
export interface UIElement {
  /** Element name. */
  name: string
  /** Control type name, e.g. Button, Edit, Text, List, Window. */
  controlType: string
  /** Automation identifier of the element. */
  automationId: string
  /** Bounding rectangle: origin (x, y) plus width (w) and height (h). */
  boundingRect: {
    x: number
    y: number
    w: number
    h: number
  }
  /** Whether the element reports itself as enabled. */
  isEnabled: boolean
  /** Current value; present only when the element exposes ValuePattern. */
  value?: string
  /** Child elements; present only when children were collected. */
  children?: Array<UIElement>
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Helper
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// PowerShell prelude that loads the assemblies needed for UI Automation.
const UIA_ASSEMBLIES = [
  '',
  'Add-Type -AssemblyName UIAutomationClient',
  'Add-Type -AssemblyName UIAutomationTypes',
  'Add-Type -AssemblyName WindowsBase',
  '',
].join('\n')
|
||||
|
||||
/**
 * Runs a PowerShell script synchronously and returns its trimmed stdout.
 *
 * stderr is captured but not inspected.
 *
 * @param script - PowerShell source passed via `-Command`.
 * @returns stdout decoded as UTF-8 with surrounding whitespace removed.
 */
function ps(script: string): string {
  // PowerShell encodes redirected stdout with [Console]::OutputEncoding, which
  // defaults to the console code page rather than UTF-8. TextDecoder below
  // assumes UTF-8, so non-ASCII element names and values would otherwise be
  // garbled. The switch is best-effort and uses a BOM-less encoding.
  const utf8Prelude =
    'try { [Console]::OutputEncoding = New-Object System.Text.UTF8Encoding $false } catch {}\n'
  const result = Bun.spawnSync({
    cmd: [
      'powershell',
      '-NoProfile',
      '-NonInteractive',
      '-Command',
      utf8Prelude + script,
    ],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  return new TextDecoder().decode(result.stdout).trim()
}
|
||||
|
||||
/**
 * Parses `raw` as JSON, returning `fallback` when `raw` is empty or is not
 * valid JSON. The parsed value is cast to `T` without further validation.
 */
function parseJsonSafe<T>(raw: string, fallback: T): T {
  if (!raw) return fallback
  let value: T
  try {
    value = JSON.parse(raw) as T
  } catch {
    return fallback
  }
  return value
}
|
||||
|
||||
// PowerShell fragment that resolves $window from $title. It first looks for a
// top-level element whose Name equals $title, and failing that takes the first
// top-level element whose Name contains $title.
// The calling script must set $title before this fragment runs.
const PS_FIND_WINDOW = [
  '',
  '$root = [System.Windows.Automation.AutomationElement]::RootElement',
  '$window = $root.FindFirst(',
  '[System.Windows.Automation.TreeScope]::Children,',
  '[System.Windows.Automation.PropertyCondition]::new(',
  '[System.Windows.Automation.AutomationElement]::NameProperty, $title))',
  'if ($window -eq $null) {',
  '$all = $root.FindAll(',
  '[System.Windows.Automation.TreeScope]::Children,',
  '[System.Windows.Automation.Condition]::TrueCondition)',
  'foreach ($el in $all) {',
  'if ($el.Current.Name -and $el.Current.Name.Contains($title)) {',
  '$window = $el',
  'break',
  '}',
  '}',
  '}',
  '',
].join('\n')
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Public API
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/**
 * Get the UI element tree of a window, up to `depth` levels deep (default 3).
 *
 * The window is located by title via the shared find-window fragment. The
 * result lists the window's children (the window element itself is not
 * included). Returns an empty array when the window is not found, has no
 * children, or the script output cannot be parsed.
 *
 * @param windowTitle - Title used to locate the window
 * @param depth - Maximum number of levels to descend; truncated to a
 *   non-negative integer, non-finite values fall back to 3
 */
export function getUITree(windowTitle: string, depth: number = 3): UIElement[] {
  // PowerShell accepts ' and the typographic quotes U+2018..U+201B as
  // single-quote delimiters, so all of them must be doubled.
  const escapedTitle = windowTitle.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  // `depth` is interpolated into the script; keep it a plain integer.
  const maxDepth = Number.isFinite(depth) ? Math.max(0, Math.trunc(depth)) : 3

  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output '[]'
  exit
}

function Get-UIChildren($parent, $currentDepth, $maxDepth) {
  if ($currentDepth -ge $maxDepth) { return @() }
  $children = $parent.FindAll(
    [System.Windows.Automation.TreeScope]::Children,
    [System.Windows.Automation.Condition]::TrueCondition)
  $result = @()
  foreach ($el in $children) {
    $rect = $el.Current.BoundingRectangle
    $obj = @{
      name = [string]$el.Current.Name
      controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
      automationId = [string]$el.Current.AutomationId
      boundingRect = @{
        x = [int]$rect.X
        y = [int]$rect.Y
        w = [int]$rect.Width
        h = [int]$rect.Height
      }
      isEnabled = $el.Current.IsEnabled
    }
    try {
      $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
      if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
    } catch {}
    $sub = Get-UIChildren $el ($currentDepth + 1) $maxDepth
    if ($sub.Count -gt 0) { $obj['children'] = $sub }
    $result += $obj
  }
  return $result
}

$tree = Get-UIChildren $window 0 ${maxDepth}
if ($tree -eq $null -or $tree.Count -eq 0) {
  Write-Output '[]'
} else {
  $tree | ConvertTo-Json -Depth 20 -Compress
}
`
  const raw = ps(script)
  const parsed = parseJsonSafe<UIElement | UIElement[] | null>(raw, [])

  // PowerShell unrolls a one-element array returned from a function, so a
  // node with exactly one child arrives with `children` as a single object
  // (and a one-element top level arrives as a bare object). Restore arrays
  // at every level so the result matches the UIElement shape.
  const withChildArrays = (node: UIElement): UIElement => {
    const kids = node.children as UIElement | UIElement[] | null | undefined
    if (kids === undefined || kids === null) return node
    const list = Array.isArray(kids) ? kids : [kids]
    return { ...node, children: list.map(withChildArrays) }
  }

  const roots = Array.isArray(parsed) ? parsed : parsed ? [parsed] : []
  return roots.map(withChildArrays)
}
|
||||
|
||||
/**
 * Find a single element inside a window matching the given query fields.
 *
 * All supplied fields must match (they are combined with AndCondition); the
 * first matching descendant of the window is returned.
 *
 * @param windowTitle - Title used to locate the window
 * @param query - Any combination of name, controlType and automationId.
 *   `controlType` must be a bare identifier such as `Button` or `Edit`.
 * @returns The element, or null when the query is empty, `controlType` is not
 *   a valid identifier, the window or element is not found, or the output
 *   cannot be parsed.
 */
export function findElement(
  windowTitle: string,
  query: { name?: string; controlType?: string; automationId?: string },
): UIElement | null {
  // PowerShell accepts ' and the typographic quotes U+2018..U+201B as
  // single-quote delimiters, so all of them must be doubled.
  const psEscape = (value: string): string =>
    value.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const propertyCondition = (property: string, valueExpr: string): string =>
    `[System.Windows.Automation.PropertyCondition]::new([System.Windows.Automation.AutomationElement]::${property}, ${valueExpr})`

  const escapedTitle = psEscape(windowTitle)

  // Build conditions array
  const conditions: string[] = []
  if (query.name) {
    conditions.push(propertyCondition('NameProperty', `'${psEscape(query.name)}'`))
  }
  if (query.controlType) {
    // The control type becomes an unquoted static member name in the script,
    // so quote escaping cannot protect it. Accept identifiers only; anything
    // else could inject arbitrary PowerShell.
    if (!/^[A-Za-z][A-Za-z0-9]*$/.test(query.controlType)) return null
    conditions.push(
      propertyCondition(
        'ControlTypeProperty',
        `[System.Windows.Automation.ControlType]::${query.controlType}`,
      ),
    )
  }
  if (query.automationId) {
    conditions.push(
      propertyCondition('AutomationIdProperty', `'${psEscape(query.automationId)}'`),
    )
  }

  if (conditions.length === 0) return null

  const conditionExpr =
    conditions.length === 1
      ? conditions[0]
      : `[System.Windows.Automation.AndCondition]::new(@(${conditions.join(', ')}))`

  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output 'null'
  exit
}
$cond = ${conditionExpr}
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
  Write-Output 'null'
  exit
}
$rect = $el.Current.BoundingRectangle
$obj = @{
  name = [string]$el.Current.Name
  controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
  automationId = [string]$el.Current.AutomationId
  boundingRect = @{
    x = [int]$rect.X
    y = [int]$rect.Y
    w = [int]$rect.Width
    h = [int]$rect.Height
  }
  isEnabled = $el.Current.IsEnabled
}
try {
  $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
  if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
} catch {}
$obj | ConvertTo-Json -Compress
`
  const raw = ps(script)
  return parseJsonSafe<UIElement | null>(raw, null)
}
|
||||
|
||||
/**
 * Click an element by its automationId using InvokePattern.
 *
 * @param windowTitle - Title used to locate the window
 * @param automationId - AutomationId of the descendant element to invoke
 * @returns true only when the script reports a successful Invoke; false when
 *   the window or element is missing or the element cannot be invoked.
 */
export function clickElement(windowTitle: string, automationId: string): boolean {
  // PowerShell accepts ' and the typographic quotes U+2018..U+201B as
  // single-quote delimiters, so all of them must be doubled.
  const psEscape = (value: string): string =>
    value.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const escapedTitle = psEscape(windowTitle)
  const escapedId = psEscape(automationId)

  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output 'false'
  exit
}
$cond = [System.Windows.Automation.PropertyCondition]::new(
  [System.Windows.Automation.AutomationElement]::AutomationIdProperty, '${escapedId}')
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
  Write-Output 'false'
  exit
}
try {
  $ip = $el.GetCurrentPattern([System.Windows.Automation.InvokePattern]::Pattern)
  $ip.Invoke()
  Write-Output 'true'
} catch {
  Write-Output 'false'
}
`
  return ps(script) === 'true'
}
|
||||
|
||||
/**
 * Set the value of an element by its automationId using ValuePattern.
 *
 * @param windowTitle - Title used to locate the window
 * @param automationId - AutomationId of the descendant element to update
 * @param value - Text passed to ValuePattern.SetValue
 * @returns true only when the script reports a successful SetValue; false
 *   when the window or element is missing or the value cannot be set.
 */
export function setValue(windowTitle: string, automationId: string, value: string): boolean {
  // PowerShell accepts ' and the typographic quotes U+2018..U+201B as
  // single-quote delimiters, so all of them must be doubled. Typed text often
  // contains typographic apostrophes, which would otherwise end the literal.
  const psEscape = (text: string): string =>
    text.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const escapedTitle = psEscape(windowTitle)
  const escapedId = psEscape(automationId)
  const escapedValue = psEscape(value)

  const script = `
${UIA_ASSEMBLIES}
$title = '${escapedTitle}'
${PS_FIND_WINDOW}
if ($window -eq $null) {
  Write-Output 'false'
  exit
}
$cond = [System.Windows.Automation.PropertyCondition]::new(
  [System.Windows.Automation.AutomationElement]::AutomationIdProperty, '${escapedId}')
$el = $window.FindFirst([System.Windows.Automation.TreeScope]::Descendants, $cond)
if ($el -eq $null) {
  Write-Output 'false'
  exit
}
try {
  $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
  $vp.SetValue('${escapedValue}')
  Write-Output 'true'
} catch {
  Write-Output 'false'
}
`
  return ps(script) === 'true'
}
|
||||
|
||||
/**
 * Get the UI element at a specific screen coordinate.
 *
 * @param x - Horizontal coordinate passed to System.Windows.Point
 * @param y - Vertical coordinate passed to System.Windows.Point
 * @returns The element under the point, or null when the coordinates are not
 *   finite numbers, no element is found, or the lookup fails.
 */
export function elementAtPoint(x: number, y: number): UIElement | null {
  // NaN/Infinity would be interpolated verbatim and produce an invalid
  // script; answer directly instead of spawning PowerShell.
  if (!Number.isFinite(x) || !Number.isFinite(y)) return null

  const script = `
${UIA_ASSEMBLIES}
try {
  $point = [System.Windows.Point]::new(${x}, ${y})
  $el = [System.Windows.Automation.AutomationElement]::FromPoint($point)
  if ($el -eq $null) {
    Write-Output 'null'
    exit
  }
  $rect = $el.Current.BoundingRectangle
  $obj = @{
    name = [string]$el.Current.Name
    controlType = $el.Current.ControlType.ProgrammaticName -replace 'ControlType\\.', ''
    automationId = [string]$el.Current.AutomationId
    boundingRect = @{
      x = [int]$rect.X
      y = [int]$rect.Y
      w = [int]$rect.Width
      h = [int]$rect.Height
    }
    isEnabled = $el.Current.IsEnabled
  }
  try {
    $vp = $el.GetCurrentPattern([System.Windows.Automation.ValuePattern]::Pattern)
    if ($vp -ne $null) { $obj['value'] = $vp.Current.Value }
  } catch {}
  $obj | ConvertTo-Json -Compress
} catch {
  Write-Output 'null'
}
`
  const raw = ps(script)
  return parseJsonSafe<UIElement | null>(raw, null)
}
|
||||
129
src/utils/computerUse/win32/windowCapture.ts
Normal file
129
src/utils/computerUse/win32/windowCapture.ts
Normal file
@@ -0,0 +1,129 @@
|
||||
/**
|
||||
* Window-level screenshot capture using Win32 PrintWindow API.
|
||||
* Captures windows even when occluded or minimized.
|
||||
*/
|
||||
|
||||
/** A captured window image. */
interface CaptureResult {
  /** Base64-encoded image data. */
  base64: string
  /** Image width. */
  width: number
  /** Image height. */
  height: number
}
|
||||
|
||||
// PowerShell prelude defining [WinCap]::Capture(title). The method returns
// "width,height,<base64 PNG>" on success, or one of the markers NOT_FOUND
// (no such window), INVALID_SIZE (rect without positive area) or
// CAPTURE_FAILED (PrintWindow reported failure, so the bitmap would be blank).
// Bitmap, Graphics and MemoryStream are released through `using`, and the
// HDC through try/finally, so an exception cannot leak GDI resources.
// NOTE: the closing '@ of the here-string must stay at the start of its line.
const CAPTURE_BY_TITLE_PS = `
Add-Type -AssemblyName System.Drawing
Add-Type -ReferencedAssemblies System.Drawing @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
public class WinCap {
    [DllImport("user32.dll", CharSet=CharSet.Unicode)]
    public static extern IntPtr FindWindow(string c, string t);
    [DllImport("user32.dll")]
    public static extern bool GetWindowRect(IntPtr h, out RECT r);
    [DllImport("user32.dll")]
    public static extern bool PrintWindow(IntPtr h, IntPtr hdc, uint f);
    [StructLayout(LayoutKind.Sequential)]
    public struct RECT { public int L, T, R, B; }

    public static string Capture(string title) {
        IntPtr hwnd = FindWindow(null, title);
        if (hwnd == IntPtr.Zero) return "NOT_FOUND";
        RECT r;
        if (!GetWindowRect(hwnd, out r)) return "NOT_FOUND";
        int w = r.R - r.L; int h = r.B - r.T;
        if (w <= 0 || h <= 0) return "INVALID_SIZE";
        using (Bitmap bmp = new Bitmap(w, h))
        using (var ms = new System.IO.MemoryStream()) {
            bool ok;
            using (Graphics g = Graphics.FromImage(bmp)) {
                IntPtr hdc = g.GetHdc();
                try { ok = PrintWindow(hwnd, hdc, 2); }
                finally { g.ReleaseHdc(hdc); }
            }
            if (!ok) return "CAPTURE_FAILED";
            bmp.Save(ms, ImageFormat.Png);
            return w + "," + h + "," + Convert.ToBase64String(ms.ToArray());
        }
    }
}
'@
`
|
||||
|
||||
// PowerShell prelude defining [WinCapH]::Capture(hwnd). The method returns
// "width,height,<base64 PNG>" on success, or one of the markers NOT_FOUND
// (handle is not a window), INVALID_SIZE (rect without positive area) or
// CAPTURE_FAILED (PrintWindow reported failure, so the bitmap would be blank).
// Bitmap, Graphics and MemoryStream are released through `using`, and the
// HDC through try/finally, so an exception cannot leak GDI resources.
// NOTE: the closing '@ of the here-string must stay at the start of its line.
const CAPTURE_BY_HWND_PS = `
Add-Type -AssemblyName System.Drawing
Add-Type -ReferencedAssemblies System.Drawing @'
using System;
using System.Runtime.InteropServices;
using System.Drawing;
using System.Drawing.Imaging;
public class WinCapH {
    [DllImport("user32.dll")]
    public static extern bool GetWindowRect(IntPtr h, out RECT r);
    [DllImport("user32.dll")]
    public static extern bool PrintWindow(IntPtr h, IntPtr hdc, uint f);
    [DllImport("user32.dll")]
    public static extern bool IsWindow(IntPtr hWnd);
    [StructLayout(LayoutKind.Sequential)]
    public struct RECT { public int L, T, R, B; }

    public static string Capture(IntPtr hwnd) {
        if (!IsWindow(hwnd)) return "NOT_FOUND";
        RECT r;
        if (!GetWindowRect(hwnd, out r)) return "NOT_FOUND";
        int w = r.R - r.L; int h = r.B - r.T;
        if (w <= 0 || h <= 0) return "INVALID_SIZE";
        using (Bitmap bmp = new Bitmap(w, h))
        using (var ms = new System.IO.MemoryStream()) {
            bool ok;
            using (Graphics g = Graphics.FromImage(bmp)) {
                IntPtr hdc = g.GetHdc();
                try { ok = PrintWindow(hwnd, hdc, 2); }
                finally { g.ReleaseHdc(hdc); }
            }
            if (!ok) return "CAPTURE_FAILED";
            bmp.Save(ms, ImageFormat.Png);
            return w + "," + h + "," + Convert.ToBase64String(ms.ToArray());
        }
    }
}
'@
`
|
||||
|
||||
/**
 * Parses capture-script output of the form `width,height,base64`.
 *
 * Returns null for empty output, the NOT_FOUND / INVALID_SIZE markers, output
 * with fewer than two commas, or when width, height or the payload is
 * missing, zero or not numeric. Everything after the second comma is taken
 * as the payload.
 */
function parseCaptureOutput(raw: string): CaptureResult | null {
  const text = raw.trim()
  switch (text) {
    case '':
    case 'NOT_FOUND':
    case 'INVALID_SIZE':
      return null
  }

  const widthEnd = text.indexOf(',')
  const heightEnd = text.indexOf(',', widthEnd + 1)
  if (widthEnd === -1 || heightEnd === -1) return null

  const width = Number(text.slice(0, widthEnd))
  const height = Number(text.slice(widthEnd + 1, heightEnd))
  const base64 = text.slice(heightEnd + 1)

  return width && height && base64 ? { base64, width, height } : null
}
|
||||
|
||||
/**
 * Executes `script` with PowerShell (no profile, non-interactive) and returns
 * the decoded, trimmed stdout. stderr is piped but not read.
 */
function runPs(script: string): string {
  const argv = ['powershell', '-NoProfile', '-NonInteractive', '-Command', script]
  const { stdout } = Bun.spawnSync({ cmd: argv, stdout: 'pipe', stderr: 'pipe' })
  const decoder = new TextDecoder()
  return decoder.decode(stdout).trim()
}
|
||||
|
||||
/**
 * Capture a window screenshot by its exact title.
 * Uses PrintWindow which works even for occluded/background windows.
 *
 * @param title - Exact window title passed to FindWindow
 * @returns The captured image, or null when the window is not found or the
 *   capture output cannot be parsed.
 */
export function captureWindow(title: string): CaptureResult | null {
  // PowerShell accepts ' and the typographic quotes U+2018..U+201B as
  // single-quote delimiters, so all of them must be doubled. Escaping only
  // the ASCII quote lets a title such as "Don’t" end the literal early.
  const escaped = title.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  const script = `${CAPTURE_BY_TITLE_PS}\n[WinCap]::Capture('${escaped}')`
  const raw = runPs(script)
  return parseCaptureOutput(raw)
}
|
||||
|
||||
/**
 * Capture a window screenshot by its HWND handle.
 *
 * @param hwnd - Window handle as an integer
 * @returns The captured image, or null when `hwnd` is not an integer, is not
 *   a window, or the capture output cannot be parsed.
 */
export function captureWindowByHwnd(hwnd: number): CaptureResult | null {
  // The handle is interpolated into the script as a numeric literal. NaN,
  // Infinity or a fractional value would yield an invalid IntPtr expression.
  if (!Number.isSafeInteger(hwnd)) return null
  const script = `${CAPTURE_BY_HWND_PS}\n[WinCapH]::Capture([IntPtr]::new(${hwnd}))`
  const raw = runPs(script)
  return parseCaptureOutput(raw)
}
|
||||
86
src/utils/computerUse/win32/windowEnum.ts
Normal file
86
src/utils/computerUse/win32/windowEnum.ts
Normal file
@@ -0,0 +1,86 @@
|
||||
/**
|
||||
* Window enumeration using Win32 EnumWindows API.
|
||||
* Returns visible windows with their HWND, PID, and title.
|
||||
*/
|
||||
|
||||
/** One visible top-level window as reported by the enumeration script. */
export interface WindowInfo {
  /** Window handle, as a number. */
  hwnd: number
  /** Id of the process that owns the window. */
  pid: number
  /** Window title text. */
  title: string
}
|
||||
|
||||
// PowerShell script that enumerates visible top-level windows with a
// non-blank title and prints one record per line as "hwnd|pid|title".
// CR/LF inside a title is replaced with a space, because a line break would
// split the record and corrupt the line-based output.
// NOTE: the closing '@ of the here-string must stay at the start of its line.
const ENUM_WINDOWS_PS = `
Add-Type @'
using System;
using System.Collections.Generic;
using System.Runtime.InteropServices;
using System.Text;
public class WinEnum {
    public delegate bool EnumWindowsProc(IntPtr hWnd, IntPtr lParam);

    [DllImport("user32.dll")]
    public static extern bool EnumWindows(EnumWindowsProc lpEnumFunc, IntPtr lParam);
    [DllImport("user32.dll")]
    public static extern bool IsWindowVisible(IntPtr hWnd);
    [DllImport("user32.dll", CharSet=CharSet.Unicode)]
    public static extern int GetWindowText(IntPtr hWnd, StringBuilder lpString, int nMaxCount);
    [DllImport("user32.dll", CharSet=CharSet.Unicode)]
    public static extern int GetWindowTextLength(IntPtr hWnd);
    [DllImport("user32.dll")]
    public static extern uint GetWindowThreadProcessId(IntPtr hWnd, out uint processId);

    public static List<string> results = new List<string>();

    public static void Run() {
        results.Clear();
        EnumWindows(delegate(IntPtr hWnd, IntPtr lParam) {
            if (!IsWindowVisible(hWnd)) return true;
            int len = GetWindowTextLength(hWnd);
            if (len == 0) return true;
            StringBuilder sb = new StringBuilder(len + 1);
            GetWindowText(hWnd, sb, sb.Capacity);
            string title = sb.ToString().Replace("\\r", " ").Replace("\\n", " ");
            if (string.IsNullOrWhiteSpace(title)) return true;
            uint pid = 0;
            GetWindowThreadProcessId(hWnd, out pid);
            results.Add(hWnd.ToInt64() + "|" + pid + "|" + title);
            return true;
        }, IntPtr.Zero);
    }
}
'@
[WinEnum]::Run()
[WinEnum]::results | ForEach-Object { $_ }
`
|
||||
|
||||
/**
 * List all visible windows with non-empty titles.
 * Returns HWND, PID, and window title for each.
 *
 * Lines of script output that do not match `hwnd|pid|title` (integer hwnd and
 * pid, non-empty title) are skipped. Returns an empty array when the script
 * produces no output.
 */
export function listWindows(): WindowInfo[] {
  // PowerShell encodes redirected stdout with [Console]::OutputEncoding, which
  // defaults to the console code page rather than UTF-8. TextDecoder below
  // assumes UTF-8, so non-ASCII titles would otherwise be garbled. The switch
  // is best-effort and uses a BOM-less encoding.
  const utf8Prelude =
    'try { [Console]::OutputEncoding = New-Object System.Text.UTF8Encoding $false } catch {}\n'
  const result = Bun.spawnSync({
    cmd: [
      'powershell',
      '-NoProfile',
      '-NonInteractive',
      '-Command',
      utf8Prelude + ENUM_WINDOWS_PS,
    ],
    stdout: 'pipe',
    stderr: 'pipe',
  })
  const raw = new TextDecoder().decode(result.stdout).trim()
  if (!raw) return []

  const windows: WindowInfo[] = []
  for (const line of raw.split('\n')) {
    // The title may itself contain '|', so only the first two pipes delimit.
    // Requiring digits rejects empty fields, which Number('') would read as 0.
    const match = /^(-?\d+)\|(\d+)\|(.+)$/.exec(line.trim())
    if (!match) continue
    windows.push({
      hwnd: Number(match[1]),
      pid: Number(match[2]),
      title: match[3],
    })
  }
  return windows
}
|
||||
Reference in New Issue
Block a user