fix: subtract cached_tokens from input_tokens in OpenAI stream adapter

OpenAI's prompt_tokens includes cached tokens, but Anthropic's
input_tokens semantics exclude them. The adapter was mapping
prompt_tokens → input_tokens verbatim, causing downstream code
(cache hit rate, cost, autocompact) to double-count cached tokens.

Real-world impact: DeepSeek returns prompt_tokens=34097 with
cached_tokens=34048, displayed as a ~50% hit rate
(34048 / (34097 + 34048)) instead of 99.86% (34048 / 34097).

Co-Authored-By: glm-5.1 <zai-org@claude-code-best.win>
This commit is contained in:
claude-code-best
2026-05-22 21:52:58 +08:00
parent b1c4f40f90
commit ed61932748
2 changed files with 49 additions and 12 deletions

View File

@@ -13,10 +13,10 @@ import { randomUUID } from 'crypto'
* finish_reason → message_delta(stop_reason) + message_stop
*
* Usage field mapping (OpenAI → Anthropic):
* prompt_tokens → input_tokens
* completion_tokens → output_tokens
* prompt_tokens_details.cached_tokens → cache_read_input_tokens
* (no OpenAI equivalent) → cache_creation_input_tokens (always 0)
* prompt_tokens - cached_tokens → input_tokens (non-cached input only)
* completion_tokens → output_tokens
* prompt_tokens_details.cached_tokens → cache_read_input_tokens
* (no OpenAI equivalent) → cache_creation_input_tokens (always 0)
*
* All four fields are emitted in the post-loop message_delta (not message_start)
* so that trailing usage chunks (sent after finish_reason by some
@@ -54,6 +54,9 @@ export async function* adaptOpenAIStreamToAnthropic(
let textBlockOpen = false
// Track usage — all four Anthropic fields, populated from OpenAI usage fields:
// rawInputTokens tracks the raw prompt_tokens (OpenAI total, including cached).
// inputTokens is the derived Anthropic value (non-cached only = rawInputTokens - cachedReadTokens).
let rawInputTokens = 0
let inputTokens = 0
let outputTokens = 0
let cachedReadTokens = 0
@@ -71,12 +74,17 @@ export async function* adaptOpenAIStreamToAnthropic(
// Extract usage from any chunk that carries it.
if (chunk.usage) {
inputTokens = chunk.usage.prompt_tokens ?? inputTokens
rawInputTokens = chunk.usage.prompt_tokens ?? rawInputTokens
const rawCached =
((chunk.usage as any).prompt_tokens_details?.cached_tokens as
| number
| undefined) ?? cachedReadTokens
// Anthropic's input_tokens = non-cached input only. OpenAI's prompt_tokens
// includes cached tokens, so subtract. Clamp to 0 in case cached > total
// due to a streaming race.
inputTokens = Math.max(0, rawInputTokens - rawCached)
outputTokens = chunk.usage.completion_tokens ?? outputTokens
const details = (chunk.usage as any).prompt_tokens_details
if (details?.cached_tokens != null) {
cachedReadTokens = details.cached_tokens
}
cachedReadTokens = rawCached
}
// Emit message_start on first chunk