feat: 添加 Provider Registry、StatusLine、Cache Stats 和其他增强

- providerRegistry: OpenAI 兼容 provider 切换（Cerebras/Groq/DeepSeek/Qwen） - StatusLine: 增强状态栏（缓存命中率、TTL 倒计时、自定义 shell 命令） - cacheStats: 缓存命中率和 token 签名追踪 - ultrareviewPreflight: 代码审查预检服务 - SkillsMenu/filterSkills: 技能菜单过滤增强 - MagicDocs/langfuse prompts: 提示词更新 - claude.ts: API 客户端更新 Co-Authored-By: glm-5-turbo <zai-org@claude-code-best.win>
2026-06-18 22:35:51 +00:00 · 2026-05-09 23:04:35 +08:00
parent fdddb6dbe8
commit efaf4afd9c
28 changed files with 3613 additions and 219 deletions
--- a/src/services/api/claude.ts
+++ b/src/services/api/claude.ts
@@ -93,7 +93,10 @@ import {
  asSystemPrompt,
  type SystemPrompt,
 } from '../../utils/systemPromptType.js'
-import { cloneDeep } from 'lodash-es'
+import {
+  getBreakCacheMarkerPath,
+  getBreakCacheAlwaysPath,
+} from '../../commands/break-cache/index.js'
 import { tokenCountFromLastAPIResponse } from '../../utils/tokens.js'
 import { getDynamicConfig_BLOCKS_ON_INIT } from '../analytics/growthbook.js'
 import {
@@ -121,6 +124,7 @@ import {
  getAfkModeHeaderLatched,
  getCacheEditingHeaderLatched,
  getFastModeHeaderLatched,
+  getLastApiCompletionTimestamp,
  getPromptCache1hAllowlist,
  getPromptCache1hEligible,
  getSessionId,
@@ -250,6 +254,7 @@ import {
  type NonNullableUsage,
 } from './logging.js'
 import {
+  CACHE_TTL_1HOUR_MS,
  checkResponseForCacheBreak,
  recordPromptState,
 } from './promptCacheBreakDetection.js'
@@ -507,30 +512,10 @@ export function getAPIMetadata() {
    }
  }

-  const deviceId = getOrCreateUserID()
-
-  // Third-party API providers (DeepSeek, etc.) validate user_id against
-  // ^[a-zA-Z0-9_-]+$ which rejects JSON strings containing {, ", :, etc.
-  // When using a non-Anthropic base URL, send only the device_id (hex string).
-  const baseUrl = process.env.ANTHROPIC_BASE_URL
-  const isThirdParty =
-    baseUrl &&
-    (() => {
-      try {
-        return new URL(baseUrl).host !== 'api.anthropic.com'
-      } catch {
-        return false
-      }
-    })()
-
-  if (isThirdParty) {
-    return { user_id: deviceId }
-  }
-
  return {
    user_id: jsonStringify({
      ...extra,
-      device_id: deviceId,
+      device_id: getOrCreateUserID(),
      // Only include OAuth account UUID when actively using OAuth authentication
      account_uuid: getOauthAccountInfo()?.accountUuid ?? '',
      session_id: getSessionId(),
@@ -1441,12 +1426,39 @@ async function* queryModel(
    ].filter(Boolean),
  )

+  // ── Break-cache integration ──
+  // If a one-time break-cache marker exists, or always-mode is on, append a
+  // unique ephemeral nonce comment to the system prompt so the prefix-cache
+  // hash changes for this request, forcing a cache miss.
+  {
+    const { existsSync, unlinkSync } = await import('node:fs')
+    const { randomUUID } = await import('node:crypto')
+    const onceMarker = getBreakCacheMarkerPath()
+    const alwaysFlag = getBreakCacheAlwaysPath()
+    const shouldBreak = existsSync(onceMarker) || existsSync(alwaysFlag)
+    if (shouldBreak) {
+      const nonce = randomUUID()
+      systemPrompt = asSystemPrompt([
+        ...systemPrompt,
+        `<!-- cache-break nonce: ${nonce} -->`,
+      ])
+      // Only delete the once marker; the always flag persists until /break-cache off
+      if (existsSync(onceMarker)) {
+        try {
+          unlinkSync(onceMarker)
+        } catch {
+          /* best-effort */
+        }
+      }
+    }
+  }
+
  // Prepend system prompt block for easy API identification
  logAPIPrefix(systemPrompt)

  const enablePromptCaching =
    options.enablePromptCaching ?? getPromptCachingEnabled(options.model)
-  let system = buildSystemPromptBlocks(systemPrompt, enablePromptCaching, {
+  const system = buildSystemPromptBlocks(systemPrompt, enablePromptCaching, {
    skipGlobalCacheForSystemPrompt: needsToolBasedCacheMarker,
    querySource: options.querySource,
  })
@@ -1466,7 +1478,7 @@ async function* queryModel(
      model: advisorModel,
    } as unknown as BetaToolUnion)
  }
-  let allTools = [...toolSchemas, ...extraToolSchemas]
+  const allTools = [...toolSchemas, ...extraToolSchemas]

  const isFastMode =
    isFastModeEnabled() &&
@@ -1590,39 +1602,6 @@ async function* queryModel(
  const consumedCacheEdits = cachedMCEnabled ? consumePendingCacheEdits() : null
  const consumedPinnedEdits = cachedMCEnabled ? getPinnedCacheEdits() : []

-  // ---------------------------------------------------------------------------
-  // Serialization boundary: deep-clone heavy data so the closure below captures
-  // independent copies, not references to the originals. After this point the
-  // original variables (messagesForAPI, system, allTools) are nulled out so
-  // they can be GC'd even while the generator/closure is still alive (during
-  // long streaming responses or retry backoff).
-  // ---------------------------------------------------------------------------
-  const frozenMessages = addCacheBreakpoints(
-    messagesForAPI,
-    enablePromptCaching,
-    options.querySource,
-    cachedMCEnabled &&
-      getAPIProvider() === 'firstParty' &&
-      options.querySource === 'repl_main_thread',
-    consumedCacheEdits as any,
-    consumedPinnedEdits as any,
-    options.skipCacheWrite,
-  )
-  const frozenSystem = cloneDeep(system)
-  const frozenTools = cloneDeep(allTools)
-
-  // Pre-compute scalars that post-streaming code needs, so messagesForAPI
-  // can be released before streaming starts.
-  const preMessagesCount = messagesForAPI.length
-  const preMessagesTokenCount = tokenCountFromLastAPIResponse(messagesForAPI)
-
-  // Release originals for GC — the frozen* copies and pre-computed scalars
-  // are now the only references to this data inside the closure.
-  // After null-out, all downstream code uses frozen* or pre-computed scalars.
-  messagesForAPI = null!
-  system = null!
-  allTools = null!
-
  // Capture the betas sent in the last API request, including the ones that
  // were dynamically added, so we can log and send it to telemetry.
  let lastRequestBetas: string[] | undefined
@@ -1725,6 +1704,9 @@ async function* queryModel(
      clearAllThinking: false,
    })

+    const enablePromptCaching =
+      options.enablePromptCaching ?? getPromptCachingEnabled(retryContext.model)
+
    // Fast mode: header is latched session-stable (cache-safe), but
    // `speed='fast'` stays dynamic so cooldown still suppresses the actual
    // fast-mode request without changing the cache key.
@@ -1755,10 +1737,13 @@ async function* queryModel(
      }
    }

-    // Cache editing beta: header is latched session-stable.
-    // The useCachedMC gate (cache_edits body behavior) is baked into
-    // frozenMessages at the serialization boundary above, so this block
-    // only controls the beta header.
+    // Cache editing beta: header is latched session-stable; useCachedMC
+    // (controls cache_edits body behavior) stays live so edits stop when
+    // the feature disables but the header doesn't flip.
+    const useCachedMC =
+      cachedMCEnabled &&
+      getAPIProvider() === 'firstParty' &&
+      options.querySource === 'repl_main_thread'
    if (
      cacheEditingHeaderLatched &&
      cacheEditingBetaHeader &&
@@ -1787,9 +1772,17 @@ async function* queryModel(

    return {
      model: normalizeModelStringForAPI(options.model),
-      messages: frozenMessages,
-      system: frozenSystem,
-      tools: frozenTools,
+      messages: addCacheBreakpoints(
+        messagesForAPI,
+        enablePromptCaching,
+        options.querySource,
+        useCachedMC,
+        consumedCacheEdits as any,
+        consumedPinnedEdits as any,
+        options.skipCacheWrite,
+      ),
+      system,
+      tools: allTools,
      tool_choice: options.toolChoice,
      ...(useBetas && { betas: filteredBetas }),
      metadata: getAPIMetadata(),
@@ -1849,9 +1842,6 @@ async function* queryModel(
  let ttftMs = 0
  let partialMessage: BetaMessage | undefined
  const contentBlocks: (BetaContentBlock | ConnectorTextBlock)[] = []
-  // Accumulate streaming deltas in arrays to avoid O(n²) string concatenation.
-  // Joined and assigned to contentBlock fields at content_block_stop.
-  const streamingDeltas = new Map<number, string[]>()
  let usage: NonNullableUsage = EMPTY_USAGE
  let costUSD = 0
  let stopReason: BetaStopReason | null = null
@@ -2138,8 +2128,6 @@ async function* queryModel(
                }
                break
            }
-            // Initialize delta accumulator for this content block
-            streamingDeltas.set(part.index, [])
            break
          case 'content_block_delta': {
            const contentBlock = contentBlocks[part.index]
@@ -2169,9 +2157,8 @@ async function* queryModel(
                })
                throw new Error('Content block is not a connector_text block')
              }
-              streamingDeltas
-                .get(part.index)
-                ?.push(delta.connector_text as string)
+              ;(contentBlock as { connector_text: string }).connector_text +=
+                delta.connector_text
            } else {
              switch (delta.type) {
                case 'citations_delta':
@@ -2201,9 +2188,7 @@ async function* queryModel(
                    })
                    throw new Error('Content block input is not a string')
                  }
-                  streamingDeltas
-                    .get(part.index)
-                    ?.push(delta.partial_json as string)
+                  contentBlock.input += delta.partial_json
                  break
                case 'text_delta':
                  if (contentBlock.type !== 'text') {
@@ -2217,7 +2202,7 @@ async function* queryModel(
                    })
                    throw new Error('Content block is not a text block')
                  }
-                  streamingDeltas.get(part.index)?.push(delta.text!)
+                  ;(contentBlock as { text: string }).text += delta.text
                  break
                case 'signature_delta':
                  if (
@@ -2252,7 +2237,8 @@ async function* queryModel(
                    })
                    throw new Error('Content block is not a thinking block')
                  }
-                  streamingDeltas.get(part.index)?.push(delta.thinking!)
+                  ;(contentBlock as { thinking: string }).thinking +=
+                    delta.thinking
                  break
              }
            }
@@ -2284,32 +2270,6 @@ async function* queryModel(
              })
              throw new Error('Message not found')
            }
-            // Join accumulated streaming deltas into the contentBlock fields
-            // to avoid O(n²) string concatenation during streaming.
-            const deltas = streamingDeltas.get(part.index)
-            if (deltas && deltas.length > 0) {
-              const joined = deltas.join('')
-              switch (contentBlock.type) {
-                case 'text':
-                  ;(contentBlock as { text: string }).text = joined
-                  break
-                case 'thinking':
-                  ;(contentBlock as { thinking: string }).thinking = joined
-                  break
-                case 'tool_use':
-                case 'server_tool_use':
-                  contentBlock.input = joined
-                  break
-                default:
-                  if ((contentBlock.type as string) === 'connector_text') {
-                    ;(
-                      contentBlock as { connector_text: string }
-                    ).connector_text = joined
-                  }
-                  break
-              }
-              streamingDeltas.delete(part.index)
-            }
            const m: AssistantMessage = {
              message: {
                ...partialMessage,
@@ -2864,8 +2824,8 @@ async function* queryModel(
        logAPIError({
          error,
          model: errorModel,
-          messageCount: preMessagesCount,
-          messageTokens: preMessagesTokenCount,
+          messageCount: messagesForAPI.length,
+          messageTokens: tokenCountFromLastAPIResponse(messagesForAPI),
          durationMs: Date.now() - start,
          durationMsIncludingRetries: Date.now() - startIncludingRetries,
          attempt: attemptNumber,
@@ -2886,10 +2846,7 @@ async function* queryModel(

        yield getAssistantMessageFromError(error, errorModel, {
          messages,
-          messagesForAPI: frozenMessages as unknown as (
-            | UserMessage
-            | AssistantMessage
-          )[],
+          messagesForAPI,
        })
        releaseStreamResources()
        return
@@ -2923,8 +2880,8 @@ async function* queryModel(
      logAPIError({
        error,
        model: errorModel,
-        messageCount: preMessagesCount,
-        messageTokens: preMessagesTokenCount,
+        messageCount: messagesForAPI.length,
+        messageTokens: tokenCountFromLastAPIResponse(messagesForAPI),
        durationMs: Date.now() - start,
        durationMsIncludingRetries: Date.now() - startIncludingRetries,
        attempt: attemptNumber,
@@ -2947,10 +2904,7 @@ async function* queryModel(

      yield getAssistantMessageFromError(error, errorModel, {
        messages,
-        messagesForAPI: frozenMessages as unknown as (
-          | UserMessage
-          | AssistantMessage
-        )[],
+        messagesForAPI,
      })
      releaseStreamResources()
      return
@@ -3006,19 +2960,14 @@ async function* queryModel(
  // Precompute scalars so the fire-and-forget .then() closure doesn't pin the
  // full messagesForAPI array (the entire conversation up to the context window
  // limit) until getToolPermissionContext() resolves.
-  // Note: messagesForAPI was nulled above (serialization boundary), so we use
-  // the pre-computed scalars captured before the null-out.
-  const logMessageCount = preMessagesCount
-  const logMessageTokens = preMessagesTokenCount
+  const logMessageCount = messagesForAPI.length
+  const logMessageTokens = tokenCountFromLastAPIResponse(messagesForAPI)

  // Record LLM observation in Langfuse (no-op if not configured)
  recordLLMObservation(options.langfuseTrace ?? null, {
    model: resolvedModel,
    provider: getAPIProvider(),
-    input: convertMessagesToLangfuse(
-      frozenMessages as Parameters<typeof convertMessagesToLangfuse>[0],
-      systemPrompt,
-    ),
+    input: convertMessagesToLangfuse(messagesForAPI, systemPrompt),
    output: convertOutputToLangfuse(newMessages),
    usage: {
      input_tokens: usage.input_tokens,