Merge remote-tracking branch 'guunergooner/fix/openai-stop-reason-usage'

2026-06-22 00:05:51 +00:00 · 2026-04-10 22:27:16 +08:00
parent b060eabda9 c82f59943c
commit 7088fe3c8b
6 changed files with 865 additions and 61 deletions
--- a/src/services/api/openai/index.ts
+++ b/src/services/api/openai/index.ts
@@ -30,6 +30,7 @@ import { logForDebugging } from '../../../utils/debug.js'
 import { addToTotalSessionCost } from '../../../cost-tracker.js'
 import { calculateUSDCost } from '../../../utils/modelCost.js'
 import { isEnvTruthy, isEnvDefinedFalsy } from '../../../utils/envUtils.js'
+import { getModelMaxOutputTokens } from '../../../utils/context.js'
 import type { Options } from '../claude.js'
 import { randomUUID } from 'crypto'
 import {
@@ -87,16 +88,18 @@ export function buildOpenAIRequestBody(params: {
  tools: any[]
  toolChoice: any
  enableThinking: boolean
+  maxTokens: number
  temperatureOverride?: number
 }): ChatCompletionCreateParamsStreaming & {
  thinking?: { type: string }
  enable_thinking?: boolean
  chat_template_kwargs?: { thinking: boolean }
 } {
-  const { model, messages, tools, toolChoice, enableThinking, temperatureOverride } = params
+  const { model, messages, tools, toolChoice, enableThinking, maxTokens, temperatureOverride } = params
  return {
    model,
    messages,
+    max_tokens: maxTokens,
    ...(tools.length > 0 && {
      tools,
      ...(toolChoice && { tool_choice: toolChoice }),
@@ -120,6 +123,56 @@ export function buildOpenAIRequestBody(params: {
  }
 }

+/**
+ * Build the final outputs from accumulated stream state: one AssistantMessage with
+ * all content blocks (plus usage and stop_reason), and an error when stopReason is
+ * 'max_tokens'. Shared by the `message_stop` handler and the post-loop fallback.
+ */
+function assembleFinalAssistantOutputs(params: {
+  partialMessage: any
+  contentBlocks: Record<number, any>
+  tools: Tools
+  agentId: string | undefined
+  usage: { input_tokens: number; output_tokens: number; cache_creation_input_tokens: number; cache_read_input_tokens: number }
+  stopReason: string | null
+  maxTokens: number
+}): (AssistantMessage | SystemAPIErrorMessage)[] {
+  const { partialMessage, contentBlocks, tools, agentId, usage, stopReason, maxTokens } = params
+  const outputs: (AssistantMessage | SystemAPIErrorMessage)[] = []
+
+  const allBlocks = Object.keys(contentBlocks) // keys are strings; order them by numeric block index
+    .sort((a, b) => Number(a) - Number(b))
+    .map(k => contentBlocks[Number(k)])
+    .filter(Boolean) // drop empty/falsy slots
+
+  if (allBlocks.length > 0) { // with no blocks, no AssistantMessage is emitted
+    outputs.push({
+      message: {
+        ...partialMessage,
+        content: normalizeContentFromAPI(allBlocks, tools, agentId),
+        usage,
+        stop_reason: stopReason,
+        stop_sequence: null,
+      },
+      requestId: undefined,
+      type: 'assistant',
+      uuid: randomUUID(),
+      timestamp: new Date().toISOString(),
+    } as AssistantMessage)
+  }
+
+  if (stopReason === 'max_tokens') { // checked independently of whether any block was emitted
+    outputs.push(createAssistantAPIErrorMessage({
+      content: `Output truncated: response exceeded the ${maxTokens} token limit. ` +
+        `Set CLAUDE_CODE_MAX_OUTPUT_TOKENS to override.`,
+      apiError: 'max_output_tokens',
+      error: 'max_output_tokens',
+    }))
+  }
+
+  return outputs
+}
+
 /**
 * OpenAI-compatible query path. Converts Anthropic-format messages/tools to
 * OpenAI format, calls the OpenAI-compatible endpoint, and converts the
@@ -222,7 +275,20 @@ export async function* queryModelOpenAI(
      )
    }

-    // 10. Get client and make streaming request
+    // 10. Compute max_tokens — required by most OpenAI-compatible endpoints.
+    //     Without this the server uses a tiny default, and when
+    //     thinking is enabled the thinking phase consumes the entire budget
+    //     leaving no tokens for the final response.
+    //
+    //     Use upperLimit (not the slot-cap default) because the Anthropic path's
+    //     slot-reservation cap (CAPPED_DEFAULT_MAX_TOKENS=8k) is paired with an
+    //     auto-retry at 64k in query.ts. The OpenAI path has no such retry, so
+    //     using the capped 8k default would silently truncate responses in
+    //     multi-turn conversations where thinking consumes most of the budget.
+    const { upperLimit } = getModelMaxOutputTokens(openaiModel)
+    const maxTokens = options.maxOutputTokensOverride ?? upperLimit
+
+    // 11. Get client
    const client = getOpenAIClient({
      maxRetries: 0,
      fetchOverride: options.fetchOverride as unknown as typeof fetch,
@@ -233,13 +299,14 @@ export async function* queryModelOpenAI(
      `[OpenAI] Calling model=${openaiModel}, messages=${openaiMessages.length}, tools=${openaiTools.length}, thinking=${enableThinking}`,
    )

-    // 11. Call OpenAI API with streaming
+    // 12. Call OpenAI API with streaming
    const requestBody = buildOpenAIRequestBody({
      model: openaiModel,
      messages: openaiMessages,
      tools: openaiTools,
      toolChoice: openaiToolChoice,
      enableThinking,
+      maxTokens,
      temperatureOverride: options.temperatureOverride,
    })
    const stream = await client.chat.completions.create(
@@ -254,6 +321,7 @@ export async function* queryModelOpenAI(
    // Accumulate content blocks and usage, same as the Anthropic path in claude.ts
    const contentBlocks: Record<number, any> = {}
    let partialMessage: any
+    let stopReason: string | null = null
    let usage = {
      input_tokens: 0,
      output_tokens: 0,
@@ -307,21 +375,7 @@ export async function* queryModelOpenAI(
          break
        }
        case 'content_block_stop': {
-          const idx = (event as any).index
-          const block = contentBlocks[idx]
-          if (!block || !partialMessage) break
-
-          const m: AssistantMessage = {
-            message: {
-              ...partialMessage,
-              content: normalizeContentFromAPI([block], tools, options.agentId),
-            },
-            requestId: undefined,
-            type: 'assistant',
-            uuid: randomUUID(),
-            timestamp: new Date().toISOString(),
-          }
-          yield m
+          // Block accumulation is complete; assembly happens at message_stop (or in the post-loop fallback).
          break
        }
        case 'message_delta': {
@@ -329,21 +383,33 @@ export async function* queryModelOpenAI(
          if (deltaUsage) {
            usage = { ...usage, ...deltaUsage }
          }
-          // Update the stop_reason on the last yielded message
-          // (we don't have a reference here, but the consumer handles this)
+          if ((event as any).delta?.stop_reason != null) {
+            stopReason = (event as any).delta.stop_reason
+          }
          break
        }
-        case 'message_stop':
+        case 'message_stop': {
+          // Assemble ONE AssistantMessage with ALL content blocks, matching the
+          // Anthropic SDK path. Real usage (input + output tokens) is available
+          // here and injected so tokenCountWithEstimation() can read it.
+          if (partialMessage) {
+            for (const output of assembleFinalAssistantOutputs({
+              partialMessage, contentBlocks, tools, agentId: options.agentId,
+              usage, stopReason, maxTokens,
+            })) {
+              yield output
+            }
+            // Reset partialMessage so the post-loop safety fallback does not
+            // yield a second identical AssistantMessage.
+            partialMessage = null
+          }
+          // Track cost and token usage
+          if (usage.input_tokens + usage.output_tokens > 0) {
+            const costUSD = calculateUSDCost(openaiModel, usage as any)
+            addToTotalSessionCost(costUSD, usage as any, options.model)
+          }
          break
-      }
-
-      // Track cost and token usage (matching the Anthropic path in claude.ts)
-      if (
-        event.type === 'message_stop' &&
-        usage.input_tokens + usage.output_tokens > 0
-      ) {
-        const costUSD = calculateUSDCost(openaiModel, usage as any)
-        addToTotalSessionCost(costUSD, usage as any, options.model)
+        }
      }

      // Also yield as StreamEvent for real-time display (matching Anthropic path)
@@ -353,6 +419,16 @@ export async function* queryModelOpenAI(
        ...(event.type === 'message_start' ? { ttftMs } : undefined),
      } as StreamEvent
    }
+
+    // Safety: if stream ended without message_stop, assemble and yield whatever we have
+    if (partialMessage) {
+      for (const output of assembleFinalAssistantOutputs({
+        partialMessage, contentBlocks, tools, agentId: options.agentId,
+        usage, stopReason, maxTokens,
+      })) {
+        yield output
+      }
+    }
  } catch (error) {
    const errorMessage = error instanceof Error ? error.message : String(error)
    logForDebugging(`[OpenAI] Error: ${errorMessage}`, { level: 'error' })
@@ -362,4 +438,4 @@ export async function* queryModelOpenAI(
      error: (error instanceof Error ? error : new Error(String(error))) as unknown as SDKAssistantMessageError,
    })
  }
-}
+}