feat: Windows Computer Use enhancement — PrintWindow, UI Automation, OCR

New Windows-native capabilities:
- windowCapture.ts: PrintWindow API for per-window screenshots (works on
  occluded/background windows)
- windowEnum.ts: EnumWindows for precise window enumeration with HWND
- uiAutomation.ts: IUIAutomation for UI tree reading, element clicking,
  text input, and coordinate-based element identification
- ocr.ts: Windows.Media.Ocr for screen text recognition (en-US + zh-CN)

Updated the win32.ts backend: listRunning() now enumerates windows via
EnumWindows (reporting each window's HWND as bundleId and its title as
displayName), and a new captureWindowTarget() provides window-specific
screenshots.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
unraid
2026-04-04 00:00:02 +08:00
parent e3264a1691
commit 3707c3c0ba
7 changed files with 1141 additions and 5 deletions

View File

@@ -11,6 +11,9 @@ import type {
SwiftBackend, WindowDisplayInfo,
} from '../types.js'
import { listWindows } from 'src/utils/computerUse/win32/windowEnum.js'
import { captureWindow, captureWindowByHwnd } from 'src/utils/computerUse/win32/windowCapture.js'
// ---------------------------------------------------------------------------
// PowerShell helper
// ---------------------------------------------------------------------------
@@ -155,11 +158,11 @@ $apps | Select-Object -Unique | Select-Object -First 200
listRunning() {
try {
const raw = ps(`Get-Process | Where-Object { $_.MainWindowTitle -ne '' } | Select-Object -First 50 | ForEach-Object { "$($_.MainModule.FileName)|$($_.ProcessName)" }`)
return raw.split('\n').filter(Boolean).map(line => {
const [exePath, name] = line.split('|', 2)
return { bundleId: exePath ?? '', displayName: name ?? '' }
})
const windows = listWindows()
return windows.map(w => ({
bundleId: String(w.hwnd),
displayName: w.title,
}))
} catch {
return []
}
@@ -246,4 +249,15 @@ $ms.Dispose()
const base64 = raw.slice(secondComma + 1)
return { base64, width: w, height: h }
},
/**
 * Screenshot one specific window (PrintWindow-based), so the target may be
 * occluded or in the background.
 *
 * @param titleOrHwnd A string is forwarded to `captureWindow` as a window
 *   title; a number is forwarded to `captureWindowByHwnd` as a window handle.
 * @returns Whatever the selected capture helper returns — a screenshot
 *   result, or `null`.
 */
captureWindowTarget(titleOrHwnd: string | number): ScreenshotResult | null {
  // Dispatch on the runtime type: titles are strings, HWNDs are numbers.
  return typeof titleOrHwnd === 'string'
    ? captureWindow(titleOrHwnd)
    : captureWindowByHwnd(titleOrHwnd)
},
}