mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-15 12:55:51 +00:00
fix: subtract cached_tokens from input_tokens in OpenAI stream adapter
OpenAI's prompt_tokens includes cached tokens, but Anthropic's input_tokens semantics exclude them. The adapter was mapping prompt_tokens → input_tokens verbatim, causing downstream code (cache hit rate, cost, autocompact) to double-count cached tokens. Real-world impact: DeepSeek returns prompt_tokens=34097 with cached_tokens=34048, which was displayed as a 50% hit rate instead of 99.86%. Co-Authored-By: glm-5.1 <zai-org@claude-code-best.win>
This commit is contained in:
@@ -551,7 +551,8 @@ describe('prompt caching support', () => {
|
|||||||
|
|
||||||
const msgStart = events.find(e => e.type === 'message_start') as any
|
const msgStart = events.find(e => e.type === 'message_start') as any
|
||||||
expect(msgStart.message.usage.cache_read_input_tokens).toBe(800)
|
expect(msgStart.message.usage.cache_read_input_tokens).toBe(800)
|
||||||
expect(msgStart.message.usage.input_tokens).toBe(1000)
|
// input_tokens = prompt_tokens - cached_tokens = 1000 - 800 = 200
|
||||||
|
expect(msgStart.message.usage.input_tokens).toBe(200)
|
||||||
})
|
})
|
||||||
|
|
||||||
test('defaults cache_read_input_tokens to 0 when no cached_tokens', async () => {
|
test('defaults cache_read_input_tokens to 0 when no cached_tokens', async () => {
|
||||||
@@ -750,7 +751,8 @@ describe('prompt caching support', () => {
|
|||||||
|
|
||||||
// message_delta carries the real values from the trailing chunk
|
// message_delta carries the real values from the trailing chunk
|
||||||
const msgDelta = events.find(e => e.type === 'message_delta') as any
|
const msgDelta = events.find(e => e.type === 'message_delta') as any
|
||||||
expect(msgDelta.usage.input_tokens).toBe(30011)
|
// input_tokens = prompt_tokens - cached_tokens = 30011 - 19904 = 10107
|
||||||
|
expect(msgDelta.usage.input_tokens).toBe(10107)
|
||||||
expect(msgDelta.usage.output_tokens).toBe(190)
|
expect(msgDelta.usage.output_tokens).toBe(190)
|
||||||
expect(msgDelta.usage.cache_read_input_tokens).toBe(19904)
|
expect(msgDelta.usage.cache_read_input_tokens).toBe(19904)
|
||||||
expect(msgDelta.usage.cache_creation_input_tokens).toBe(0)
|
expect(msgDelta.usage.cache_creation_input_tokens).toBe(0)
|
||||||
@@ -821,7 +823,34 @@ describe('prompt caching support', () => {
|
|||||||
|
|
||||||
const msgDelta = events.find(e => e.type === 'message_delta') as any
|
const msgDelta = events.find(e => e.type === 'message_delta') as any
|
||||||
expect(msgDelta.usage.cache_read_input_tokens).toBe(1500)
|
expect(msgDelta.usage.cache_read_input_tokens).toBe(1500)
|
||||||
expect(msgDelta.usage.input_tokens).toBe(2000)
|
// input_tokens = prompt_tokens - cached_tokens = 2000 - 1500 = 500
|
||||||
|
expect(msgDelta.usage.input_tokens).toBe(500)
|
||||||
expect(msgDelta.usage.output_tokens).toBe(100)
|
expect(msgDelta.usage.output_tokens).toBe(100)
|
||||||
})
|
})
|
||||||
|
|
||||||
|
test('subtracts cached_tokens from input_tokens to match Anthropic semantic', async () => {
|
||||||
|
// Anthropic's input_tokens = non-cached tokens only.
|
||||||
|
// OpenAI's prompt_tokens = total input including cached.
|
||||||
|
// The adapter must subtract: input_tokens = prompt_tokens - cached_tokens.
|
||||||
|
const events = await collectEvents([
|
||||||
|
makeChunk({
|
||||||
|
choices: [{ index: 0, delta: { content: 'hi' }, finish_reason: null }],
|
||||||
|
}),
|
||||||
|
makeChunk({
|
||||||
|
choices: [{ index: 0, delta: {}, finish_reason: 'stop' }],
|
||||||
|
usage: {
|
||||||
|
prompt_tokens: 34097,
|
||||||
|
completion_tokens: 30,
|
||||||
|
total_tokens: 34127,
|
||||||
|
prompt_tokens_details: { cached_tokens: 34048 },
|
||||||
|
} as any,
|
||||||
|
}),
|
||||||
|
])
|
||||||
|
|
||||||
|
const msgDelta = events.find(e => e.type === 'message_delta') as any
|
||||||
|
// input_tokens = 34097 - 34048 = 49 (non-cached input only)
|
||||||
|
expect(msgDelta.usage.input_tokens).toBe(49)
|
||||||
|
expect(msgDelta.usage.cache_read_input_tokens).toBe(34048)
|
||||||
|
expect(msgDelta.usage.output_tokens).toBe(30)
|
||||||
|
})
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -13,10 +13,10 @@ import { randomUUID } from 'crypto'
|
|||||||
* finish_reason → message_delta(stop_reason) + message_stop
|
* finish_reason → message_delta(stop_reason) + message_stop
|
||||||
*
|
*
|
||||||
* Usage field mapping (OpenAI → Anthropic):
|
* Usage field mapping (OpenAI → Anthropic):
|
||||||
* prompt_tokens → input_tokens
|
* prompt_tokens - cached_tokens → input_tokens (non-cached input only)
|
||||||
* completion_tokens → output_tokens
|
* completion_tokens → output_tokens
|
||||||
* prompt_tokens_details.cached_tokens → cache_read_input_tokens
|
* prompt_tokens_details.cached_tokens → cache_read_input_tokens
|
||||||
* (no OpenAI equivalent) → cache_creation_input_tokens (always 0)
|
* (no OpenAI equivalent) → cache_creation_input_tokens (always 0)
|
||||||
*
|
*
|
||||||
* All four fields are emitted in the post-loop message_delta (not message_start)
|
* All four fields are emitted in the post-loop message_delta (not message_start)
|
||||||
* so that trailing usage chunks (sent after finish_reason by some
|
* so that trailing usage chunks (sent after finish_reason by some
|
||||||
@@ -54,6 +54,9 @@ export async function* adaptOpenAIStreamToAnthropic(
|
|||||||
let textBlockOpen = false
|
let textBlockOpen = false
|
||||||
|
|
||||||
// Track usage — all four Anthropic fields, populated from OpenAI usage fields:
|
// Track usage — all four Anthropic fields, populated from OpenAI usage fields:
|
||||||
|
// rawInputTokens tracks the raw prompt_tokens (OpenAI total, including cached).
|
||||||
|
// inputTokens is the derived Anthropic value (non-cached only = rawInputTokens - cachedReadTokens).
|
||||||
|
let rawInputTokens = 0
|
||||||
let inputTokens = 0
|
let inputTokens = 0
|
||||||
let outputTokens = 0
|
let outputTokens = 0
|
||||||
let cachedReadTokens = 0
|
let cachedReadTokens = 0
|
||||||
@@ -71,12 +74,17 @@ export async function* adaptOpenAIStreamToAnthropic(
|
|||||||
|
|
||||||
// Extract usage from any chunk that carries it.
|
// Extract usage from any chunk that carries it.
|
||||||
if (chunk.usage) {
|
if (chunk.usage) {
|
||||||
inputTokens = chunk.usage.prompt_tokens ?? inputTokens
|
rawInputTokens = chunk.usage.prompt_tokens ?? rawInputTokens
|
||||||
|
const rawCached =
|
||||||
|
((chunk.usage as any).prompt_tokens_details?.cached_tokens as
|
||||||
|
| number
|
||||||
|
| undefined) ?? cachedReadTokens
|
||||||
|
// Anthropic's input_tokens = non-cached input only. OpenAI's prompt_tokens
|
||||||
|
// includes cached tokens, so subtract. Clamp to 0 in case cached > total
|
||||||
|
// due to a streaming race.
|
||||||
|
inputTokens = Math.max(0, rawInputTokens - rawCached)
|
||||||
outputTokens = chunk.usage.completion_tokens ?? outputTokens
|
outputTokens = chunk.usage.completion_tokens ?? outputTokens
|
||||||
const details = (chunk.usage as any).prompt_tokens_details
|
cachedReadTokens = rawCached
|
||||||
if (details?.cached_tokens != null) {
|
|
||||||
cachedReadTokens = details.cached_tokens
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Emit message_start on first chunk
|
// Emit message_start on first chunk
|
||||||
|
|||||||
Reference in New Issue
Block a user