Files
claude-code/src/utils/computerUse/wrapper.tsx
claude-code-best 227083d31f fix: 修复截图 MIME 类型硬编码导致 API 拒绝的问题
macOS screencapture 输出 PNG,但代码硬编码 mimeType 为 image/jpeg,
导致 API 报错 "specified using image/jpeg but appears to be image/png"。
改为通过 magic bytes 检测实际图片格式。

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-12 17:32:28 +08:00

400 lines
15 KiB
TypeScript

/**
* The `.call()` override — thin adapter between `ToolUseContext` and
* `bindSessionContext`. Spread into the MCP tool object in `client.ts`
* (same pattern as Chrome's rendering overrides, plus `.call()`).
*
* The wrapper-closure logic (build overrides fresh, lock gate, permission
* merge, screenshot stash) lives in `@ant/computer-use-mcp`'s
* `bindSessionContext`. This file binds it once per process,
* caches the dispatcher, and updates a per-call ref for the pieces of
* `ToolUseContext` that vary per-call (`abortController`, `setToolJSX`,
* `sendOSNotification`). AppState accessors are read through the ref too —
* they're likely stable but we don't depend on that.
*
* External callers reach this via the lazy require thunk in `client.ts`, gated
* on `feature('CHICAGO_MCP')`. Runtime enablement is controlled by the
* GrowthBook gate `tengu_malort_pedway` (see gates.ts).
*/
import {
bindSessionContext,
type ComputerUseSessionContext,
type CuCallToolResult,
type CuPermissionRequest,
type CuPermissionResponse,
DEFAULT_GRANT_FLAGS,
type ScreenshotDims,
} from '@ant/computer-use-mcp'
import * as React from 'react'
import { getSessionId } from '../../bootstrap/state.js'
import { ComputerUseApproval } from '../../components/permissions/ComputerUseApproval/ComputerUseApproval.js'
import type { Tool, ToolUseContext } from '../../Tool.js'
import { logForDebugging } from '../debug.js'
import { detectImageFormatFromBase64 } from '../imageResizer.js'
import {
checkComputerUseLock,
tryAcquireComputerUseLock,
} from './computerUseLock.js'
import { registerEscHotkey } from './escHotkey.js'
import { getChicagoCoordinateMode } from './gates.js'
import { getComputerUseHostAdapter } from './hostAdapter.js'
import { getComputerUseMCPRenderingOverrides } from './toolRendering.js'
type CallOverride = Pick<Tool, 'call'>['call']
type Binding = {
ctx: ComputerUseSessionContext
dispatch: (name: string, args: unknown) => Promise<CuCallToolResult>
}
/**
* Cached binding — built on first `.call()`, reused for process lifetime.
* The dispatcher's closure-held screenshot blob persists across calls.
*
* `currentToolUseContext` is updated on every call. Every getter/callback in
* `ctx` reads through it, so the per-call pieces (`abortController`,
* `setToolJSX`, `sendOSNotification`) are always current.
*
* Module-level `let` is a deliberate exception to the no-module-scope-state
* rule (src/CLAUDE.md): the dispatcher closure must persist across calls so
* its internal screenshot blob survives, but `ToolUseContext` is per-call.
* Tests will need to either inject the cache or run serially.
*/
let binding: Binding | undefined
let currentToolUseContext: ToolUseContext | undefined
function tuc(): ToolUseContext {
// Safe: `binding` is only populated when `currentToolUseContext` is set.
// Called only from within `ctx` callbacks, which only fire during dispatch.
return currentToolUseContext!
}
function formatLockHeld(holder: string): string {
return `Computer use is in use by another Claude session (${holder.slice(0, 8)}…). Wait for that session to finish or run /exit there.`
}
export function buildSessionContext(): ComputerUseSessionContext {
return {
// ── Read state fresh via the per-call ref ─────────────────────────────
getAllowedApps: () =>
tuc().getAppState().computerUseMcpState?.allowedApps ?? [],
getGrantFlags: () =>
tuc().getAppState().computerUseMcpState?.grantFlags ??
DEFAULT_GRANT_FLAGS,
// cc-2 has no Settings page for user-denied apps yet.
getUserDeniedBundleIds: () => [],
getSelectedDisplayId: () =>
tuc().getAppState().computerUseMcpState?.selectedDisplayId,
getDisplayPinnedByModel: () =>
tuc().getAppState().computerUseMcpState?.displayPinnedByModel ?? false,
getDisplayResolvedForApps: () =>
tuc().getAppState().computerUseMcpState?.displayResolvedForApps,
getLastScreenshotDims: (): ScreenshotDims | undefined => {
const d = tuc().getAppState().computerUseMcpState?.lastScreenshotDims
return d
? {
...d,
displayId: d.displayId ?? 0,
originX: d.originX ?? 0,
originY: d.originY ?? 0,
}
: undefined
},
// ── Write-backs ────────────────────────────────────────────────────────
// `setToolJSX` is guaranteed present — the gate in `main.tsx` excludes
// non-interactive sessions. The package's `_dialogSignal` (tool-finished
// dismissal) is irrelevant here: `setToolJSX` blocks the tool call, so
// the dialog can't outlive it. Ctrl+C is what matters, and
// `runPermissionDialog` wires that from the per-call ref's abortController.
onPermissionRequest: (req, _dialogSignal) => runPermissionDialog(req),
// Package does the merge (dedupe + truthy-only flags). We just persist.
onAllowedAppsChanged: (apps, flags) =>
tuc().setAppState(prev => {
const cu = prev.computerUseMcpState
const prevApps = cu?.allowedApps
const prevFlags = cu?.grantFlags
const sameApps =
prevApps?.length === apps.length &&
apps.every((a, i) => prevApps[i]?.bundleId === a.bundleId)
const sameFlags =
prevFlags?.clipboardRead === flags.clipboardRead &&
prevFlags?.clipboardWrite === flags.clipboardWrite &&
prevFlags?.systemKeyCombos === flags.systemKeyCombos
return sameApps && sameFlags
? prev
: {
...prev,
computerUseMcpState: {
...cu,
allowedApps: [...apps],
grantFlags: flags,
},
}
}),
onAppsHidden: ids => {
if (ids.length === 0) return
tuc().setAppState(prev => {
const cu = prev.computerUseMcpState
const existing = cu?.hiddenDuringTurn
if (existing && ids.every(id => existing.has(id))) return prev
return {
...prev,
computerUseMcpState: {
...cu,
hiddenDuringTurn: new Set([...(existing ?? []), ...ids]),
},
}
})
},
// Resolver writeback only fires under a pin when Swift fell back to main
// (pinned display unplugged) — the pin is semantically dead, so clear it
// and the app-set key so the chase chain runs next time. When autoResolve
// was true, onDisplayResolvedForApps re-sets the key in the same tick.
onResolvedDisplayUpdated: id =>
tuc().setAppState(prev => {
const cu = prev.computerUseMcpState
if (
cu?.selectedDisplayId === id &&
!cu.displayPinnedByModel &&
cu.displayResolvedForApps === undefined
) {
return prev
}
return {
...prev,
computerUseMcpState: {
...cu,
selectedDisplayId: id,
displayPinnedByModel: false,
displayResolvedForApps: undefined,
},
}
}),
// switch_display(name) pins; switch_display("auto") unpins and clears the
// app-set key so the next screenshot auto-resolves fresh.
onDisplayPinned: id =>
tuc().setAppState(prev => {
const cu = prev.computerUseMcpState
const pinned = id !== undefined
const nextResolvedFor = pinned ? cu?.displayResolvedForApps : undefined
if (
cu?.selectedDisplayId === id &&
cu?.displayPinnedByModel === pinned &&
cu?.displayResolvedForApps === nextResolvedFor
) {
return prev
}
return {
...prev,
computerUseMcpState: {
...cu,
selectedDisplayId: id,
displayPinnedByModel: pinned,
displayResolvedForApps: nextResolvedFor,
},
}
}),
onDisplayResolvedForApps: key =>
tuc().setAppState(prev => {
const cu = prev.computerUseMcpState
if (cu?.displayResolvedForApps === key) return prev
return {
...prev,
computerUseMcpState: { ...cu, displayResolvedForApps: key },
}
}),
onScreenshotCaptured: dims =>
tuc().setAppState(prev => {
const cu = prev.computerUseMcpState
const p = cu?.lastScreenshotDims
return p?.width === dims.width &&
p?.height === dims.height &&
p?.displayWidth === dims.displayWidth &&
p?.displayHeight === dims.displayHeight &&
p?.displayId === dims.displayId &&
p?.originX === dims.originX &&
p?.originY === dims.originY
? prev
: {
...prev,
computerUseMcpState: { ...cu, lastScreenshotDims: dims },
}
}),
// ── Lock — async, direct file-lock calls ───────────────────────────────
// No `lockHolderForGate` dance: the package's gate is async now. It
// awaits `checkCuLock`, and on `holder: undefined` + non-deferring tool
// awaits `acquireCuLock`. `defersLockAcquire` is the PACKAGE's set —
// the local copy is gone.
checkCuLock: async () => {
const c = await checkComputerUseLock()
switch (c.kind) {
case 'free':
return { holder: undefined, isSelf: false }
case 'held_by_self':
return { holder: getSessionId(), isSelf: true }
case 'blocked':
return { holder: c.by, isSelf: false }
}
},
// Called only when checkCuLock returned `holder: undefined`. The O_EXCL
// acquire is atomic — if another process grabbed it in the gap (rare),
// throw so the tool fails instead of proceeding without the lock.
// `fresh: false` (re-entrant) shouldn't happen given check said free,
// but is possible under parallel tool-use interleaving — don't spam the
// notification in that case.
acquireCuLock: async () => {
const r = await tryAcquireComputerUseLock()
if (r.kind === 'blocked') {
throw new Error(formatLockHeld(r.by))
}
if (r.fresh) {
// Global Escape → abort. Consumes the event (PI defense — prompt
// injection can't dismiss dialogs with Escape). The CGEventTap's
// CFRunLoopSource is processed by the drainRunLoop pump, so this
// holds a pump retain until unregisterEscHotkey() in cleanup.ts.
const escRegistered = registerEscHotkey(() => {
logForDebugging('[cu-esc] user escape, aborting turn')
tuc().abortController.abort()
})
tuc().sendOSNotification?.({
message: escRegistered
? 'Claude is using your computer · press Esc to stop'
: 'Claude is using your computer · press Ctrl+C to stop',
notificationType: 'computer_use_enter',
})
}
},
formatLockHeldMessage: formatLockHeld,
}
}
function getOrBind(): Binding {
if (binding) return binding
const ctx = buildSessionContext()
binding = {
ctx,
dispatch: bindSessionContext(
getComputerUseHostAdapter(),
getChicagoCoordinateMode(),
ctx,
),
}
return binding
}
/**
* Returns the full override object for a single `mcp__computer-use__{toolName}`
* tool: rendering overrides from `toolRendering.tsx` plus a `.call()` that
* dispatches through the cached binder.
*/
type ComputerUseMCPToolOverrides = ReturnType<
typeof getComputerUseMCPRenderingOverrides
> & {
call: CallOverride
}
export function getComputerUseMCPToolOverrides(
toolName: string,
): ComputerUseMCPToolOverrides {
const call: CallOverride = async (args, context: ToolUseContext) => {
currentToolUseContext = context
const { dispatch } = getOrBind()
const { telemetry, ...result } = await dispatch(toolName, args)
if (telemetry?.error_kind) {
logForDebugging(
`[Computer Use MCP] ${toolName} error_kind=${telemetry.error_kind}`,
)
}
// MCP content blocks → Anthropic API blocks. CU only produces text and
// pre-sized JPEG (executor.ts computeTargetDims → targetImageSize), so
// unlike the generic MCP path there's no resize needed — the MCP image
// shape just maps to the API's base64-source shape. The package's result
// type admits audio/resource too, but CU's handleToolCall never emits
// those; the fallthrough coerces them to empty text.
const data = Array.isArray(result.content)
? result.content.map(item =>
item.type === 'image'
? {
type: 'image' as const,
source: {
type: 'base64' as const,
media_type: item.mimeType ?? detectImageFormatFromBase64(item.data),
data: item.data,
},
}
: {
type: 'text' as const,
text: item.type === 'text' ? item.text : '',
},
)
: result.content
return { data }
}
return {
...getComputerUseMCPRenderingOverrides(toolName),
call,
}
}
/**
* Render the approval dialog mid-call via `setToolJSX` + `Promise`, wait for
* the user. Mirrors `spawnMultiAgent.ts:419-436` (the `It2SetupPrompt` pattern).
*
* The merge-into-AppState that used to live here (dedupe + truthy-only flags)
* is now in the package's `bindSessionContext` → `onAllowedAppsChanged`.
*/
async function runPermissionDialog(
req: CuPermissionRequest,
): Promise<CuPermissionResponse> {
const context = tuc()
const setToolJSX = context.setToolJSX
if (!setToolJSX) {
// Shouldn't happen — main.tsx gate excludes non-interactive. Fail safe.
return { granted: [], denied: [], flags: DEFAULT_GRANT_FLAGS }
}
try {
return await new Promise<CuPermissionResponse>((resolve, reject) => {
const signal = context.abortController.signal
// If already aborted, addEventListener won't fire — reject now so the
// promise doesn't hang waiting for a user who Ctrl+C'd.
if (signal.aborted) {
reject(new Error('Computer Use permission dialog aborted'))
return
}
const onAbort = (): void => {
signal.removeEventListener('abort', onAbort)
reject(new Error('Computer Use permission dialog aborted'))
}
signal.addEventListener('abort', onAbort)
setToolJSX({
jsx: React.createElement(ComputerUseApproval, {
request: req,
onDone: (resp: CuPermissionResponse) => {
signal.removeEventListener('abort', onAbort)
resolve(resp)
},
}),
shouldHidePromptInput: true,
})
})
} finally {
setToolJSX(null)
}
}