mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-15 12:55:51 +00:00
macOS screencapture 输出 PNG,但代码硬编码 mimeType 为 image/jpeg, 导致 API 报错 "specified using image/jpeg but appears to be image/png"。 改为通过 magic bytes 检测实际图片格式。 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
400 lines
15 KiB
TypeScript
400 lines
15 KiB
TypeScript
/**
|
|
* The `.call()` override — thin adapter between `ToolUseContext` and
|
|
* `bindSessionContext`. Spread into the MCP tool object in `client.ts`
|
|
* (same pattern as Chrome's rendering overrides, plus `.call()`).
|
|
*
|
|
* The wrapper-closure logic (build overrides fresh, lock gate, permission
|
|
* merge, screenshot stash) lives in `@ant/computer-use-mcp`'s
|
|
* `bindSessionContext`. This file binds it once per process,
|
|
* caches the dispatcher, and updates a per-call ref for the pieces of
|
|
* `ToolUseContext` that vary per-call (`abortController`, `setToolJSX`,
|
|
* `sendOSNotification`). AppState accessors are read through the ref too —
|
|
* they're likely stable but we don't depend on that.
|
|
*
|
|
* External callers reach this via the lazy require thunk in `client.ts`, gated
|
|
* on `feature('CHICAGO_MCP')`. Runtime enablement is controlled by the
|
|
* GrowthBook gate `tengu_malort_pedway` (see gates.ts).
|
|
*/
|
|
|
|
import {
|
|
bindSessionContext,
|
|
type ComputerUseSessionContext,
|
|
type CuCallToolResult,
|
|
type CuPermissionRequest,
|
|
type CuPermissionResponse,
|
|
DEFAULT_GRANT_FLAGS,
|
|
type ScreenshotDims,
|
|
} from '@ant/computer-use-mcp'
|
|
import * as React from 'react'
|
|
import { getSessionId } from '../../bootstrap/state.js'
|
|
import { ComputerUseApproval } from '../../components/permissions/ComputerUseApproval/ComputerUseApproval.js'
|
|
import type { Tool, ToolUseContext } from '../../Tool.js'
|
|
import { logForDebugging } from '../debug.js'
|
|
import { detectImageFormatFromBase64 } from '../imageResizer.js'
|
|
import {
|
|
checkComputerUseLock,
|
|
tryAcquireComputerUseLock,
|
|
} from './computerUseLock.js'
|
|
import { registerEscHotkey } from './escHotkey.js'
|
|
import { getChicagoCoordinateMode } from './gates.js'
|
|
import { getComputerUseHostAdapter } from './hostAdapter.js'
|
|
import { getComputerUseMCPRenderingOverrides } from './toolRendering.js'
|
|
|
|
type CallOverride = Pick<Tool, 'call'>['call']
|
|
|
|
type Binding = {
|
|
ctx: ComputerUseSessionContext
|
|
dispatch: (name: string, args: unknown) => Promise<CuCallToolResult>
|
|
}
|
|
|
|
/**
|
|
* Cached binding — built on first `.call()`, reused for process lifetime.
|
|
* The dispatcher's closure-held screenshot blob persists across calls.
|
|
*
|
|
* `currentToolUseContext` is updated on every call. Every getter/callback in
|
|
* `ctx` reads through it, so the per-call pieces (`abortController`,
|
|
* `setToolJSX`, `sendOSNotification`) are always current.
|
|
*
|
|
* Module-level `let` is a deliberate exception to the no-module-scope-state
|
|
* rule (src/CLAUDE.md): the dispatcher closure must persist across calls so
|
|
* its internal screenshot blob survives, but `ToolUseContext` is per-call.
|
|
* Tests will need to either inject the cache or run serially.
|
|
*/
|
|
let binding: Binding | undefined
|
|
let currentToolUseContext: ToolUseContext | undefined
|
|
|
|
function tuc(): ToolUseContext {
|
|
// Safe: `binding` is only populated when `currentToolUseContext` is set.
|
|
// Called only from within `ctx` callbacks, which only fire during dispatch.
|
|
return currentToolUseContext!
|
|
}
|
|
|
|
function formatLockHeld(holder: string): string {
|
|
return `Computer use is in use by another Claude session (${holder.slice(0, 8)}…). Wait for that session to finish or run /exit there.`
|
|
}
|
|
|
|
export function buildSessionContext(): ComputerUseSessionContext {
|
|
return {
|
|
// ── Read state fresh via the per-call ref ─────────────────────────────
|
|
getAllowedApps: () =>
|
|
tuc().getAppState().computerUseMcpState?.allowedApps ?? [],
|
|
getGrantFlags: () =>
|
|
tuc().getAppState().computerUseMcpState?.grantFlags ??
|
|
DEFAULT_GRANT_FLAGS,
|
|
// cc-2 has no Settings page for user-denied apps yet.
|
|
getUserDeniedBundleIds: () => [],
|
|
getSelectedDisplayId: () =>
|
|
tuc().getAppState().computerUseMcpState?.selectedDisplayId,
|
|
getDisplayPinnedByModel: () =>
|
|
tuc().getAppState().computerUseMcpState?.displayPinnedByModel ?? false,
|
|
getDisplayResolvedForApps: () =>
|
|
tuc().getAppState().computerUseMcpState?.displayResolvedForApps,
|
|
getLastScreenshotDims: (): ScreenshotDims | undefined => {
|
|
const d = tuc().getAppState().computerUseMcpState?.lastScreenshotDims
|
|
return d
|
|
? {
|
|
...d,
|
|
displayId: d.displayId ?? 0,
|
|
originX: d.originX ?? 0,
|
|
originY: d.originY ?? 0,
|
|
}
|
|
: undefined
|
|
},
|
|
|
|
// ── Write-backs ────────────────────────────────────────────────────────
|
|
// `setToolJSX` is guaranteed present — the gate in `main.tsx` excludes
|
|
// non-interactive sessions. The package's `_dialogSignal` (tool-finished
|
|
// dismissal) is irrelevant here: `setToolJSX` blocks the tool call, so
|
|
// the dialog can't outlive it. Ctrl+C is what matters, and
|
|
// `runPermissionDialog` wires that from the per-call ref's abortController.
|
|
onPermissionRequest: (req, _dialogSignal) => runPermissionDialog(req),
|
|
|
|
// Package does the merge (dedupe + truthy-only flags). We just persist.
|
|
onAllowedAppsChanged: (apps, flags) =>
|
|
tuc().setAppState(prev => {
|
|
const cu = prev.computerUseMcpState
|
|
const prevApps = cu?.allowedApps
|
|
const prevFlags = cu?.grantFlags
|
|
const sameApps =
|
|
prevApps?.length === apps.length &&
|
|
apps.every((a, i) => prevApps[i]?.bundleId === a.bundleId)
|
|
const sameFlags =
|
|
prevFlags?.clipboardRead === flags.clipboardRead &&
|
|
prevFlags?.clipboardWrite === flags.clipboardWrite &&
|
|
prevFlags?.systemKeyCombos === flags.systemKeyCombos
|
|
return sameApps && sameFlags
|
|
? prev
|
|
: {
|
|
...prev,
|
|
computerUseMcpState: {
|
|
...cu,
|
|
allowedApps: [...apps],
|
|
grantFlags: flags,
|
|
},
|
|
}
|
|
}),
|
|
|
|
onAppsHidden: ids => {
|
|
if (ids.length === 0) return
|
|
tuc().setAppState(prev => {
|
|
const cu = prev.computerUseMcpState
|
|
const existing = cu?.hiddenDuringTurn
|
|
if (existing && ids.every(id => existing.has(id))) return prev
|
|
return {
|
|
...prev,
|
|
computerUseMcpState: {
|
|
...cu,
|
|
hiddenDuringTurn: new Set([...(existing ?? []), ...ids]),
|
|
},
|
|
}
|
|
})
|
|
},
|
|
|
|
// Resolver writeback only fires under a pin when Swift fell back to main
|
|
// (pinned display unplugged) — the pin is semantically dead, so clear it
|
|
// and the app-set key so the chase chain runs next time. When autoResolve
|
|
// was true, onDisplayResolvedForApps re-sets the key in the same tick.
|
|
onResolvedDisplayUpdated: id =>
|
|
tuc().setAppState(prev => {
|
|
const cu = prev.computerUseMcpState
|
|
if (
|
|
cu?.selectedDisplayId === id &&
|
|
!cu.displayPinnedByModel &&
|
|
cu.displayResolvedForApps === undefined
|
|
) {
|
|
return prev
|
|
}
|
|
return {
|
|
...prev,
|
|
computerUseMcpState: {
|
|
...cu,
|
|
selectedDisplayId: id,
|
|
displayPinnedByModel: false,
|
|
displayResolvedForApps: undefined,
|
|
},
|
|
}
|
|
}),
|
|
|
|
// switch_display(name) pins; switch_display("auto") unpins and clears the
|
|
// app-set key so the next screenshot auto-resolves fresh.
|
|
onDisplayPinned: id =>
|
|
tuc().setAppState(prev => {
|
|
const cu = prev.computerUseMcpState
|
|
const pinned = id !== undefined
|
|
const nextResolvedFor = pinned ? cu?.displayResolvedForApps : undefined
|
|
if (
|
|
cu?.selectedDisplayId === id &&
|
|
cu?.displayPinnedByModel === pinned &&
|
|
cu?.displayResolvedForApps === nextResolvedFor
|
|
) {
|
|
return prev
|
|
}
|
|
return {
|
|
...prev,
|
|
computerUseMcpState: {
|
|
...cu,
|
|
selectedDisplayId: id,
|
|
displayPinnedByModel: pinned,
|
|
displayResolvedForApps: nextResolvedFor,
|
|
},
|
|
}
|
|
}),
|
|
|
|
onDisplayResolvedForApps: key =>
|
|
tuc().setAppState(prev => {
|
|
const cu = prev.computerUseMcpState
|
|
if (cu?.displayResolvedForApps === key) return prev
|
|
return {
|
|
...prev,
|
|
computerUseMcpState: { ...cu, displayResolvedForApps: key },
|
|
}
|
|
}),
|
|
|
|
onScreenshotCaptured: dims =>
|
|
tuc().setAppState(prev => {
|
|
const cu = prev.computerUseMcpState
|
|
const p = cu?.lastScreenshotDims
|
|
return p?.width === dims.width &&
|
|
p?.height === dims.height &&
|
|
p?.displayWidth === dims.displayWidth &&
|
|
p?.displayHeight === dims.displayHeight &&
|
|
p?.displayId === dims.displayId &&
|
|
p?.originX === dims.originX &&
|
|
p?.originY === dims.originY
|
|
? prev
|
|
: {
|
|
...prev,
|
|
computerUseMcpState: { ...cu, lastScreenshotDims: dims },
|
|
}
|
|
}),
|
|
|
|
// ── Lock — async, direct file-lock calls ───────────────────────────────
|
|
// No `lockHolderForGate` dance: the package's gate is async now. It
|
|
// awaits `checkCuLock`, and on `holder: undefined` + non-deferring tool
|
|
// awaits `acquireCuLock`. `defersLockAcquire` is the PACKAGE's set —
|
|
// the local copy is gone.
|
|
checkCuLock: async () => {
|
|
const c = await checkComputerUseLock()
|
|
switch (c.kind) {
|
|
case 'free':
|
|
return { holder: undefined, isSelf: false }
|
|
case 'held_by_self':
|
|
return { holder: getSessionId(), isSelf: true }
|
|
case 'blocked':
|
|
return { holder: c.by, isSelf: false }
|
|
}
|
|
},
|
|
|
|
// Called only when checkCuLock returned `holder: undefined`. The O_EXCL
|
|
// acquire is atomic — if another process grabbed it in the gap (rare),
|
|
// throw so the tool fails instead of proceeding without the lock.
|
|
// `fresh: false` (re-entrant) shouldn't happen given check said free,
|
|
// but is possible under parallel tool-use interleaving — don't spam the
|
|
// notification in that case.
|
|
acquireCuLock: async () => {
|
|
const r = await tryAcquireComputerUseLock()
|
|
if (r.kind === 'blocked') {
|
|
throw new Error(formatLockHeld(r.by))
|
|
}
|
|
if (r.fresh) {
|
|
// Global Escape → abort. Consumes the event (PI defense — prompt
|
|
// injection can't dismiss dialogs with Escape). The CGEventTap's
|
|
// CFRunLoopSource is processed by the drainRunLoop pump, so this
|
|
// holds a pump retain until unregisterEscHotkey() in cleanup.ts.
|
|
const escRegistered = registerEscHotkey(() => {
|
|
logForDebugging('[cu-esc] user escape, aborting turn')
|
|
tuc().abortController.abort()
|
|
})
|
|
tuc().sendOSNotification?.({
|
|
message: escRegistered
|
|
? 'Claude is using your computer · press Esc to stop'
|
|
: 'Claude is using your computer · press Ctrl+C to stop',
|
|
notificationType: 'computer_use_enter',
|
|
})
|
|
}
|
|
},
|
|
|
|
formatLockHeldMessage: formatLockHeld,
|
|
}
|
|
}
|
|
|
|
function getOrBind(): Binding {
|
|
if (binding) return binding
|
|
const ctx = buildSessionContext()
|
|
binding = {
|
|
ctx,
|
|
dispatch: bindSessionContext(
|
|
getComputerUseHostAdapter(),
|
|
getChicagoCoordinateMode(),
|
|
ctx,
|
|
),
|
|
}
|
|
return binding
|
|
}
|
|
|
|
/**
|
|
* Returns the full override object for a single `mcp__computer-use__{toolName}`
|
|
* tool: rendering overrides from `toolRendering.tsx` plus a `.call()` that
|
|
* dispatches through the cached binder.
|
|
*/
|
|
type ComputerUseMCPToolOverrides = ReturnType<
|
|
typeof getComputerUseMCPRenderingOverrides
|
|
> & {
|
|
call: CallOverride
|
|
}
|
|
|
|
export function getComputerUseMCPToolOverrides(
|
|
toolName: string,
|
|
): ComputerUseMCPToolOverrides {
|
|
const call: CallOverride = async (args, context: ToolUseContext) => {
|
|
currentToolUseContext = context
|
|
const { dispatch } = getOrBind()
|
|
|
|
const { telemetry, ...result } = await dispatch(toolName, args)
|
|
|
|
if (telemetry?.error_kind) {
|
|
logForDebugging(
|
|
`[Computer Use MCP] ${toolName} error_kind=${telemetry.error_kind}`,
|
|
)
|
|
}
|
|
|
|
// MCP content blocks → Anthropic API blocks. CU only produces text and
|
|
// pre-sized JPEG (executor.ts computeTargetDims → targetImageSize), so
|
|
// unlike the generic MCP path there's no resize needed — the MCP image
|
|
// shape just maps to the API's base64-source shape. The package's result
|
|
// type admits audio/resource too, but CU's handleToolCall never emits
|
|
// those; the fallthrough coerces them to empty text.
|
|
const data = Array.isArray(result.content)
|
|
? result.content.map(item =>
|
|
item.type === 'image'
|
|
? {
|
|
type: 'image' as const,
|
|
source: {
|
|
type: 'base64' as const,
|
|
media_type: item.mimeType ?? detectImageFormatFromBase64(item.data),
|
|
data: item.data,
|
|
},
|
|
}
|
|
: {
|
|
type: 'text' as const,
|
|
text: item.type === 'text' ? item.text : '',
|
|
},
|
|
)
|
|
: result.content
|
|
return { data }
|
|
}
|
|
|
|
return {
|
|
...getComputerUseMCPRenderingOverrides(toolName),
|
|
call,
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Render the approval dialog mid-call via `setToolJSX` + `Promise`, wait for
|
|
* the user. Mirrors `spawnMultiAgent.ts:419-436` (the `It2SetupPrompt` pattern).
|
|
*
|
|
* The merge-into-AppState that used to live here (dedupe + truthy-only flags)
|
|
* is now in the package's `bindSessionContext` → `onAllowedAppsChanged`.
|
|
*/
|
|
async function runPermissionDialog(
|
|
req: CuPermissionRequest,
|
|
): Promise<CuPermissionResponse> {
|
|
const context = tuc()
|
|
const setToolJSX = context.setToolJSX
|
|
if (!setToolJSX) {
|
|
// Shouldn't happen — main.tsx gate excludes non-interactive. Fail safe.
|
|
return { granted: [], denied: [], flags: DEFAULT_GRANT_FLAGS }
|
|
}
|
|
|
|
try {
|
|
return await new Promise<CuPermissionResponse>((resolve, reject) => {
|
|
const signal = context.abortController.signal
|
|
// If already aborted, addEventListener won't fire — reject now so the
|
|
// promise doesn't hang waiting for a user who Ctrl+C'd.
|
|
if (signal.aborted) {
|
|
reject(new Error('Computer Use permission dialog aborted'))
|
|
return
|
|
}
|
|
const onAbort = (): void => {
|
|
signal.removeEventListener('abort', onAbort)
|
|
reject(new Error('Computer Use permission dialog aborted'))
|
|
}
|
|
signal.addEventListener('abort', onAbort)
|
|
|
|
setToolJSX({
|
|
jsx: React.createElement(ComputerUseApproval, {
|
|
request: req,
|
|
onDone: (resp: CuPermissionResponse) => {
|
|
signal.removeEventListener('abort', onAbort)
|
|
resolve(resp)
|
|
},
|
|
}),
|
|
shouldHidePromptInput: true,
|
|
})
|
|
})
|
|
} finally {
|
|
setToolJSX(null)
|
|
}
|
|
}
|