From ed619327482e24710cd05662c15c8220fc67d0c8 Mon Sep 17 00:00:00 2001 From: claude-code-best Date: Fri, 22 May 2026 21:52:58 +0800 Subject: [PATCH] fix: subtract cached_tokens from input_tokens in OpenAI stream adapter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenAI's prompt_tokens includes cached tokens, but Anthropic's input_tokens semantics exclude them. The adapter was mapping prompt_tokens → input_tokens verbatim, causing downstream code (cache hit rate, cost, autocompact) to double-count cached tokens. Real-world impact: DeepSeek returns prompt_tokens=34097 with cached_tokens=34048, which was displayed as a 50% hit rate instead of 99.86%. Co-Authored-By: glm-5.1 --- .../__tests__/openaiStreamAdapter.test.ts | 35 +++++++++++++++++-- .../src/shared/openaiStreamAdapter.ts | 26 +++++++++----- 2 files changed, 49 insertions(+), 12 deletions(-) diff --git a/packages/@ant/model-provider/src/shared/__tests__/openaiStreamAdapter.test.ts b/packages/@ant/model-provider/src/shared/__tests__/openaiStreamAdapter.test.ts index fef15c358..8e8e15000 100644 --- a/packages/@ant/model-provider/src/shared/__tests__/openaiStreamAdapter.test.ts +++ b/packages/@ant/model-provider/src/shared/__tests__/openaiStreamAdapter.test.ts @@ -551,7 +551,8 @@ describe('prompt caching support', () => { const msgStart = events.find(e => e.type === 'message_start') as any expect(msgStart.message.usage.cache_read_input_tokens).toBe(800) - expect(msgStart.message.usage.input_tokens).toBe(1000) + // input_tokens = prompt_tokens - cached_tokens = 1000 - 800 = 200 + expect(msgStart.message.usage.input_tokens).toBe(200) }) test('defaults cache_read_input_tokens to 0 when no cached_tokens', async () => { @@ -750,7 +751,8 @@ describe('prompt caching support', () => { // message_delta carries the real values from the trailing chunk const msgDelta = events.find(e => e.type === 'message_delta') as any - expect(msgDelta.usage.input_tokens).toBe(30011) + // input_tokens = prompt_tokens
- cached_tokens = 30011 - 19904 = 10107 + expect(msgDelta.usage.input_tokens).toBe(10107) expect(msgDelta.usage.output_tokens).toBe(190) expect(msgDelta.usage.cache_read_input_tokens).toBe(19904) expect(msgDelta.usage.cache_creation_input_tokens).toBe(0) @@ -821,7 +823,34 @@ describe('prompt caching support', () => { const msgDelta = events.find(e => e.type === 'message_delta') as any expect(msgDelta.usage.cache_read_input_tokens).toBe(1500) - expect(msgDelta.usage.input_tokens).toBe(2000) + // input_tokens = prompt_tokens - cached_tokens = 2000 - 1500 = 500 + expect(msgDelta.usage.input_tokens).toBe(500) expect(msgDelta.usage.output_tokens).toBe(100) }) + + test('subtracts cached_tokens from input_tokens to match Anthropic semantic', async () => { + // Anthropic's input_tokens = non-cached tokens only. + // OpenAI's prompt_tokens = total input including cached. + // The adapter must subtract: input_tokens = prompt_tokens - cached_tokens. + const events = await collectEvents([ + makeChunk({ + choices: [{ index: 0, delta: { content: 'hi' }, finish_reason: null }], + }), + makeChunk({ + choices: [{ index: 0, delta: {}, finish_reason: 'stop' }], + usage: { + prompt_tokens: 34097, + completion_tokens: 30, + total_tokens: 34127, + prompt_tokens_details: { cached_tokens: 34048 }, + } as any, + }), + ]) + + const msgDelta = events.find(e => e.type === 'message_delta') as any + // input_tokens = 34097 - 34048 = 49 (non-cached input only) + expect(msgDelta.usage.input_tokens).toBe(49) + expect(msgDelta.usage.cache_read_input_tokens).toBe(34048) + expect(msgDelta.usage.output_tokens).toBe(30) + }) }) diff --git a/packages/@ant/model-provider/src/shared/openaiStreamAdapter.ts b/packages/@ant/model-provider/src/shared/openaiStreamAdapter.ts index 02e32d957..8a14c58db 100644 --- a/packages/@ant/model-provider/src/shared/openaiStreamAdapter.ts +++ b/packages/@ant/model-provider/src/shared/openaiStreamAdapter.ts @@ -13,10 +13,10 @@ import { randomUUID } from 'crypto' * 
finish_reason → message_delta(stop_reason) + message_stop * * Usage field mapping (OpenAI → Anthropic): - * prompt_tokens → input_tokens - * completion_tokens → output_tokens - * prompt_tokens_details.cached_tokens → cache_read_input_tokens - * (no OpenAI equivalent) → cache_creation_input_tokens (always 0) + * prompt_tokens - cached_tokens → input_tokens (non-cached input only) + * completion_tokens → output_tokens + * prompt_tokens_details.cached_tokens → cache_read_input_tokens + * (no OpenAI equivalent) → cache_creation_input_tokens (always 0) * * All four fields are emitted in the post-loop message_delta (not message_start) * so that trailing usage chunks (sent after finish_reason by some @@ -54,6 +54,9 @@ export async function* adaptOpenAIStreamToAnthropic( let textBlockOpen = false // Track usage — all four Anthropic fields, populated from OpenAI usage fields: + // rawInputTokens tracks the raw prompt_tokens (OpenAI total, including cached). + // inputTokens is the derived Anthropic value (non-cached only = rawInputTokens - cachedReadTokens). + let rawInputTokens = 0 let inputTokens = 0 let outputTokens = 0 let cachedReadTokens = 0 @@ -71,12 +74,17 @@ export async function* adaptOpenAIStreamToAnthropic( // Extract usage from any chunk that carries it. if (chunk.usage) { - inputTokens = chunk.usage.prompt_tokens ?? inputTokens + rawInputTokens = chunk.usage.prompt_tokens ?? rawInputTokens + const rawCached = + ((chunk.usage as any).prompt_tokens_details?.cached_tokens as + | number + | undefined) ?? cachedReadTokens + // Anthropic's input_tokens = non-cached input only. OpenAI's prompt_tokens + // includes cached tokens, so subtract. Clamp to 0 in case cached > total + // due to a streaming race. + inputTokens = Math.max(0, rawInputTokens - rawCached) outputTokens = chunk.usage.completion_tokens ?? 
outputTokens - const details = (chunk.usage as any).prompt_tokens_details - if (details?.cached_tokens != null) { - cachedReadTokens = details.cached_tokens - } + cachedReadTokens = rawCached } // Emit message_start on first chunk