Files
claude-code/src/utils/computerUse/win32/ocr.ts
unraid c17edcb12e feat: Computer Use — Windows 跨平台支持 + GUI 无障碍增强 + Python Bridge
三平台 Computer Use (macOS + Windows + Linux),Windows 专项增强。

- MCP server: toolCalls/tools/executor/mcpServer 等 12 文件完整实现
- 平台抽象层: platforms/{win32,darwin,linux}.ts
- 跨平台 executor: executorCrossPlatform.ts
- CHICAGO_MCP + VOICE_MODE feature flags 启用

- windowMessage.ts: SendMessageW (WM_CHAR Unicode + 剪贴板粘贴)
- windowBorder.ts: 4 叠加窗口边框 (30fps 跟踪)
- uiAutomation.ts: UI Automation 元素树/点击/写值
- accessibilitySnapshot.ts: 无障碍快照 → 模型感知 GUI
- bridge.py + bridgeClient.ts: Python 长驻进程 (替代 per-call PS)

- window_management: min/max/restore/close/focus (Win32 API)
- click_element / type_into_element: 按名称操作 (无需坐标)
- 截图自动附带 Accessibility Snapshot

- 17 种方法, stdin/stdout JSON 通信
- 窗口枚举 1.5ms vs PS 500ms, 截图 360ms vs PS 800ms
- 依赖: mss + Pillow + pywinauto
2026-04-05 15:47:20 +08:00

251 lines
7.9 KiB
TypeScript

/**
* OCR module using Windows.Media.Ocr.OcrEngine via PowerShell.
* Captures a screen region or window, then runs WinRT OCR to extract text.
*/
import { ps as runPs } from './shared.js'
/**
 * One recognized line of text together with the box that encloses it.
 */
export interface OcrLine {
  /** Text of the line as reported by the OCR engine. */
  text: string
  /**
   * Enclosing box: left (`x`), top (`y`), width (`w`) and height (`h`).
   * NOTE(review): the values come from the OCR engine's word rectangles, so they
   * are presumably relative to the captured image rather than the screen — confirm.
   */
  bounds: {
    x: number
    y: number
    w: number
    h: number
  }
}
/**
 * Outcome of one OCR pass. An "empty" result has `text: ''` and no lines.
 */
export interface OcrResult {
  /** Full recognized text. */
  text: string

  /** Recognized lines, each with its bounding box. */
  lines: OcrLine[]

  /**
   * Language tag reported by the OCR script, or the requested tag when the
   * result is empty or the script did not report one.
   */
  language: string
}
/**
 * Builds the result handed back whenever OCR produced nothing usable.
 *
 * @param language - Language tag to echo back in the result.
 * @returns A fresh result with empty text and no lines.
 */
function emptyResult(language: string): OcrResult {
  const blank: OcrResult = {
    text: '',
    lines: [],
    language: language,
  }
  return blank
}
/**
 * Builds the PowerShell script that OCRs a rectangle of the screen.
 *
 * The generated script:
 * 1. Screenshots the region using CopyFromScreen
 * 2. Saves it to a uniquely named temp PNG
 * 3. Loads it via WinRT StorageFile -> BitmapDecoder -> SoftwareBitmap
 * 4. Creates an OcrEngine for `lang`, falling back to en-US
 * 5. Runs OcrEngine.RecognizeAsync
 * 6. Prints one line of JSON: `{ text, lines: [{ text, bounds: { x, y, w, h } }], language }`
 *
 * When no engine can be created, or any step throws, the script prints an
 * empty-result JSON carrying `lang` instead. The stream and the temp file are
 * released in a `finally` block so they are cleaned up on every exit path.
 *
 * @param x - Left coordinate of the region
 * @param y - Top coordinate of the region
 * @param w - Width in pixels
 * @param h - Height in pixels
 * @param lang - Language tag handed to Windows.Globalization.Language
 * @returns The PowerShell source text
 */
function buildOcrRegionScript(
  x: number,
  y: number,
  w: number,
  h: number,
  lang: string,
): string {
  // PowerShell terminates a single-quoted string at ' and also at the
  // typographic quotes U+2018, U+2019, U+201A and U+201B. Doubling the
  // character is the escape, so `lang` cannot break out of its literal.
  const psQuote = (value: string): string =>
    value.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')

  const langLiteral = psQuote(lang)
  // JSON.stringify handles quotes/backslashes in `lang`; psQuote then makes the
  // JSON text safe inside the PowerShell literal.
  const emptyJsonLiteral = psQuote(
    JSON.stringify({ text: '', lines: [], language: lang }),
  )

  return `
Add-Type -AssemblyName System.Drawing
Add-Type -AssemblyName System.Runtime.WindowsRuntime
# Load WinRT types
$null = [Windows.Media.Ocr.OcrEngine, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Graphics.Imaging.SoftwareBitmap, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Graphics.Imaging.BitmapDecoder, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Storage.StorageFile, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Storage.Streams.RandomAccessStream, Windows.Foundation, ContentType = WindowsRuntime]
$null = [Windows.Globalization.Language, Windows.Foundation, ContentType = WindowsRuntime]
# Await helper for WinRT async operations
$asTaskGeneric = ([System.WindowsRuntimeSystemExtensions].GetMethods() | Where-Object {
  $_.Name -eq 'AsTask' -and $_.GetParameters().Count -eq 1 -and
  $_.GetParameters()[0].ParameterType.Name -eq 'IAsyncOperation\`1'
})[0]
Function Await($WinRtTask, $ResultType) {
  $asTask = $asTaskGeneric.MakeGenericMethod($ResultType)
  $netTask = $asTask.Invoke($null, @($WinRtTask))
  $netTask.Wait(-1) | Out-Null
  $netTask.Result
}
$tmpFile = $null
$stream = $null
try {
  # Step 1: Screenshot region
  $bmp = New-Object System.Drawing.Bitmap(${w}, ${h})
  $g = [System.Drawing.Graphics]::FromImage($bmp)
  $g.CopyFromScreen(${x}, ${y}, 0, 0, (New-Object System.Drawing.Size(${w}, ${h})))
  $g.Dispose()
  # Step 2: Save to temp file
  $tmpFile = [System.IO.Path]::Combine([System.IO.Path]::GetTempPath(), "ocrtemp_$([guid]::NewGuid().ToString('N')).png")
  $bmp.Save($tmpFile, [System.Drawing.Imaging.ImageFormat]::Png)
  $bmp.Dispose()
  # Step 3: Open as StorageFile -> BitmapDecoder -> SoftwareBitmap
  $storageFile = Await ([Windows.Storage.StorageFile]::GetFileFromPathAsync($tmpFile)) ([Windows.Storage.StorageFile])
  $stream = Await ($storageFile.OpenAsync([Windows.Storage.FileAccessMode]::Read)) ([Windows.Storage.Streams.IRandomAccessStream])
  $decoder = Await ([Windows.Graphics.Imaging.BitmapDecoder]::CreateAsync($stream)) ([Windows.Graphics.Imaging.BitmapDecoder])
  $softwareBmp = Await ($decoder.GetSoftwareBitmapAsync()) ([Windows.Graphics.Imaging.SoftwareBitmap])
  # Step 4: Create OCR engine
  $ocrLang = New-Object Windows.Globalization.Language('${langLiteral}')
  $engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
  if ($engine -eq $null) {
    # Fallback to en-US
    $ocrLang = New-Object Windows.Globalization.Language('en-US')
    $engine = [Windows.Media.Ocr.OcrEngine]::TryCreateFromLanguage($ocrLang)
  }
  if ($engine -eq $null) {
    Write-Output '${emptyJsonLiteral}'
    return
  }
  # Step 5: Run OCR
  $ocrResult = Await ($engine.RecognizeAsync($softwareBmp)) ([Windows.Media.Ocr.OcrResult])
  # Step 6: Extract lines; each line's box is the union of its word rects
  $lines = @()
  foreach ($line in $ocrResult.Lines) {
    $minX = [double]::MaxValue; $minY = [double]::MaxValue
    $maxX = 0.0; $maxY = 0.0
    foreach ($word in $line.Words) {
      $r = $word.BoundingRect
      if ($r.X -lt $minX) { $minX = $r.X }
      if ($r.Y -lt $minY) { $minY = $r.Y }
      if (($r.X + $r.Width) -gt $maxX) { $maxX = $r.X + $r.Width }
      if (($r.Y + $r.Height) -gt $maxY) { $maxY = $r.Y + $r.Height }
    }
    $lines += @{
      text = $line.Text
      bounds = @{
        x = [int]$minX
        y = [int]$minY
        w = [int]($maxX - $minX)
        h = [int]($maxY - $minY)
      }
    }
  }
  $output = @{
    text = $ocrResult.Text
    lines = $lines
    language = $ocrLang.LanguageTag
  }
  Write-Output (ConvertTo-Json $output -Depth 4 -Compress)
} catch {
  Write-Output '${emptyJsonLiteral}'
} finally {
  # Cleanup runs on success, on the early return above, and on failure.
  # Guarded so a cleanup error can never add a second line of output.
  if ($stream -ne $null) { try { $stream.Dispose() } catch { } }
  if ($tmpFile -ne $null) { Remove-Item -LiteralPath $tmpFile -ErrorAction SilentlyContinue }
}
`
}
/**
 * Builds the PowerShell script that looks a window up by exact title (via
 * FindWindow) and prints its rectangle.
 *
 * The script prints `left,top,width,height`, or `NOT_FOUND` when no window
 * matches, or `INVALID_SIZE` when the computed width or height is not positive.
 *
 * @param windowTitle - Title passed to FindWindow; embedded as a PowerShell
 *   single-quoted literal
 * @returns The PowerShell source text
 */
function buildGetWindowRectScript(windowTitle: string): string {
  // PowerShell ends a single-quoted string at ' and also at the typographic
  // quotes U+2018, U+2019, U+201A and U+201B (common in real window titles,
  // e.g. "What’s New"). Doubling the character escapes it.
  const titleLiteral = windowTitle.replace(/['\u2018\u2019\u201A\u201B]/g, '$&$&')
  // The here-string terminator ('@) must stay at the start of its line.
  return `
Add-Type @'
using System;
using System.Runtime.InteropServices;
public class WinRect {
    [DllImport("user32.dll", CharSet=CharSet.Unicode)]
    public static extern IntPtr FindWindow(string c, string t);
    [DllImport("user32.dll")]
    public static extern bool GetWindowRect(IntPtr h, out RECT r);
    [StructLayout(LayoutKind.Sequential)]
    public struct RECT { public int L, T, R, B; }
    public static string Get(string title) {
        IntPtr hwnd = FindWindow(null, title);
        if (hwnd == IntPtr.Zero) return "NOT_FOUND";
        RECT r; GetWindowRect(hwnd, out r);
        int w = r.R - r.L; int h = r.B - r.T;
        if (w <= 0 || h <= 0) return "INVALID_SIZE";
        return r.L + "," + r.T + "," + w + "," + h;
    }
}
'@
[WinRect]::Get('${titleLiteral}')
`
}
/**
 * Converts the JSON text printed by the OCR script into an OcrResult.
 *
 * The parsed value is treated as untrusted: every field is type-checked and
 * replaced by a neutral default (`''`, `0`, `[]`, or `lang`) when it is missing
 * or has the wrong type, so the returned object always matches OcrResult.
 *
 * @param raw - Script output; expected to be a single JSON object
 * @param lang - Language tag used when the output does not carry one
 * @returns The parsed result, or an empty result when `raw` is empty, is not
 *   valid JSON, or is not a JSON object
 */
function parseOcrOutput(raw: string, lang: string): OcrResult {
  if (!raw) return emptyResult(lang)

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === 'object' && value !== null
  const toText = (value: unknown, fallback: string): string =>
    typeof value === 'string' ? value : fallback
  const toNumber = (value: unknown): number =>
    typeof value === 'number' && Number.isFinite(value) ? value : 0

  let parsed: unknown
  try {
    // trim() also strips a leading BOM, which JSON.parse would reject.
    parsed = JSON.parse(raw.trim())
  } catch {
    return emptyResult(lang)
  }
  if (!isRecord(parsed)) return emptyResult(lang)

  const rawLines: unknown[] = Array.isArray(parsed.lines) ? parsed.lines : []
  const lines: OcrResult['lines'] = rawLines.map(entry => {
    // A malformed entry degrades to an empty line instead of discarding
    // everything else that was recognized.
    const line: Record<string, unknown> = isRecord(entry) ? entry : {}
    const box: Record<string, unknown> = isRecord(line.bounds) ? line.bounds : {}
    return {
      text: toText(line.text, ''),
      bounds: {
        x: toNumber(box.x),
        y: toNumber(box.y),
        w: toNumber(box.w),
        h: toNumber(box.h),
      },
    }
  })

  return {
    text: toText(parsed.text, ''),
    lines,
    language: toText(parsed.language, lang),
  }
}
/**
 * Perform OCR on a screen region.
 * Builds the capture-and-recognize PowerShell script for the rectangle, runs it,
 * and parses its JSON output.
 *
 * Never rejects: invalid geometry and any error while running or parsing the
 * script yield an empty result carrying the requested language.
 *
 * @param x - Left coordinate
 * @param y - Top coordinate
 * @param w - Width in pixels; must be finite and greater than 0
 * @param h - Height in pixels; must be finite and greater than 0
 * @param lang - BCP-47 language tag (default 'en-US'). Confirmed: 'en-US', 'zh-Hans-CN'
 * @returns The recognized text and lines, or an empty result
 */
export async function ocrRegion(
  x: number,
  y: number,
  w: number,
  h: number,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'

  // `w <= 0` alone is false for NaN, which would then be interpolated into the
  // script verbatim; require finite values for every coordinate instead.
  const geometryOk =
    [x, y, w, h].every(value => Number.isFinite(value)) && w > 0 && h > 0
  if (!geometryOk) return emptyResult(language)

  try {
    const script = buildOcrRegionScript(x, y, w, h, language)
    const raw = runPs(script)
    return parseOcrOutput(raw, language)
  } catch {
    // Best effort by design: callers get an empty result, not an exception.
    return emptyResult(language)
  }
}
/**
 * Perform OCR on a single window identified by its title.
 * Asks PowerShell for the window rectangle, then hands that rectangle to
 * ocrRegion.
 *
 * An empty result is returned when the window is not found, its size is
 * invalid, the rectangle reply cannot be understood, or anything throws.
 *
 * @param windowTitle - Exact window title to find via FindWindow
 * @param lang - BCP-47 language tag (default 'en-US')
 */
export async function ocrWindow(
  windowTitle: string,
  lang?: string,
): Promise<OcrResult> {
  const language = lang ?? 'en-US'
  try {
    const reply = runPs(buildGetWindowRectScript(windowTitle)).trim()

    // The script answers with a sentinel instead of a rectangle on failure.
    const unusable =
      reply === '' || reply === 'NOT_FOUND' || reply === 'INVALID_SIZE'
    if (unusable) {
      return emptyResult(language)
    }

    // Expected shape: "left,top,width,height"
    const fields = reply.split(',')
    if (fields.length !== 4) {
      return emptyResult(language)
    }

    const [left, top, width, height] = fields.map(Number)
    // Falsy covers both 0 and NaN (unparseable field).
    if (!width || !height) {
      return emptyResult(language)
    }

    return ocrRegion(left, top, width, height, lang)
  } catch {
    return emptyResult(language)
  }
}