feat: 集成豆包 ASR 语音识别后端，支持 /voice doubao 切换 (#357)

* feat: 集成豆包 ASR 语音识别后端，支持 /voice doubao 切换 - 新增 src/services/doubaoSTT.ts 适配模块，将 doubaoime-asr 的 AsyncGenerator 协议适配为现有 VoiceStreamConnection 接口 - /voice doubao 启用豆包后端，/voice 使用默认 Anthropic 后端 - 后端选择持久化到 settings.json 的 voiceProvider 字段 - 豆包后端跳过 Anthropic OAuth 认证、语言限制和 Focus Mode - 豆包后端松手即出结果，跳过 processing 状态 - 凭证文件存放在 ~/.claude/tts/doubao/credentials.json - doubaoime-asr 作为 optionalDependencies 安装 - 移除 /voice 命令的 claude-ai 可用性限制，所有用户可用 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> * docs: 更新 Voice Mode 文档，添加豆包 ASR 后端说明和致谢 Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-15 12:55:51 +00:00 · 2026-04-25 13:57:30 +08:00
parent ad09f38fd1
commit 2e7fc428cd
13 changed files with 545 additions and 89 deletions
--- a/src/commands/voice/index.ts
+++ b/src/commands/voice/index.ts
@@ -1,17 +1,15 @@
 import type { Command } from '../../commands.js'
 import {
-  isVoiceGrowthBookEnabled,
-  isVoiceModeEnabled,
+  isVoiceAvailable,
 } from '../../voice/voiceModeEnabled.js'

 const voice = {
  type: 'local',
  name: 'voice',
-  description: 'Toggle voice mode',
-  availability: ['claude-ai'],
-  isEnabled: () => isVoiceGrowthBookEnabled(),
+  description: 'Toggle voice mode. Use /voice doubao for Doubao ASR backend',
+  isEnabled: () => isVoiceAvailable(),
  get isHidden() {
-    return !isVoiceModeEnabled()
+    return !isVoiceAvailable()
  },
  supportsNonInteractive: false,
  load: () => import('./voice.js'),
--- a/src/commands/voice/voice.ts
+++ b/src/commands/voice/voice.ts
@@ -2,29 +2,19 @@ import { normalizeLanguageForSTT } from '../../hooks/useVoice.js'
 import { getShortcutDisplay } from '../../keybindings/shortcutFormat.js'
 import { logEvent } from '../../services/analytics/index.js'
 import type { LocalCommandCall } from '../../types/command.js'
-import { isAnthropicAuthEnabled } from '../../utils/auth.js'
 import { getGlobalConfig, saveGlobalConfig } from '../../utils/config.js'
 import { settingsChangeDetector } from '../../utils/settings/changeDetector.js'
 import {
  getInitialSettings,
  updateSettingsForSource,
 } from '../../utils/settings/settings.js'
-import { isVoiceModeEnabled } from '../../voice/voiceModeEnabled.js'
+import { isVoiceAvailable } from '../../voice/voiceModeEnabled.js'

 const LANG_HINT_MAX_SHOWS = 2

-export const call: LocalCommandCall = async () => {
-  // Check auth and kill-switch before allowing voice mode
-  if (!isVoiceModeEnabled()) {
-    // Differentiate: OAuth-less users get an auth hint, everyone else
-    // gets nothing (command shouldn't be reachable when the kill-switch is on).
-    if (!isAnthropicAuthEnabled()) {
-      return {
-        type: 'text' as const,
-        value:
-          'Voice mode requires a Claude.ai account. Please run /login to sign in.',
-      }
-    }
+export const call: LocalCommandCall = async (args) => {
+  // Check kill-switch before allowing voice mode
+  if (!isVoiceAvailable()) {
    return {
      type: 'text' as const,
      value: 'Voice mode is not available.',
@@ -33,6 +23,47 @@ export const call: LocalCommandCall = async () => {

  const currentSettings = getInitialSettings()
  const isCurrentlyEnabled = currentSettings.voiceEnabled === true
+  const providerArg = args?.trim().toLowerCase()
+
+  // Handle provider argument when already enabled — switch backend only
+  if (isCurrentlyEnabled && providerArg === 'doubao') {
+    const result = updateSettingsForSource('userSettings', {
+      voiceProvider: 'doubao',
+    })
+    if (result.error) {
+      return {
+        type: 'text' as const,
+        value:
+          'Failed to update settings. Check your settings file for syntax errors.',
+      }
+    }
+    settingsChangeDetector.notifyChange('userSettings')
+    const key = getShortcutDisplay('voice:pushToTalk', 'Chat', 'Space')
+    return {
+      type: 'text' as const,
+      value: `Voice mode switched to Doubao ASR. Hold ${key} to record.`,
+    }
+  }
+
+  // Handle provider argument when already enabled — switch to anthropic
+  if (isCurrentlyEnabled && providerArg === 'anthropic') {
+    const result = updateSettingsForSource('userSettings', {
+      voiceProvider: 'anthropic',
+    })
+    if (result.error) {
+      return {
+        type: 'text' as const,
+        value:
+          'Failed to update settings. Check your settings file for syntax errors.',
+      }
+    }
+    settingsChangeDetector.notifyChange('userSettings')
+    const key = getShortcutDisplay('voice:pushToTalk', 'Chat', 'Space')
+    return {
+      type: 'text' as const,
+      value: `Voice mode switched to Anthropic STT. Hold ${key} to record.`,
+    }
+  }

  // Toggle OFF — no checks needed
  if (isCurrentlyEnabled) {
@@ -54,7 +85,10 @@ export const call: LocalCommandCall = async () => {
    }
  }

-  // Toggle ON — run pre-flight checks first
+  // Toggle ON — determine provider from argument or default
+  const provider = providerArg === 'doubao' ? 'doubao' : 'anthropic'
+
+  // Run pre-flight checks
  const { isVoiceStreamAvailable } = await import(
    '../../services/voiceStreamSTT.js'
  )
@@ -70,8 +104,8 @@ export const call: LocalCommandCall = async () => {
    }
  }

-  // Check for API key
-  if (!isVoiceStreamAvailable()) {
+  // Check for API key (only for Anthropic backend — Doubao uses its own credentials)
+  if (provider !== 'doubao' && !isVoiceStreamAvailable()) {
    return {
      type: 'text' as const,
      value:
@@ -111,8 +145,11 @@ export const call: LocalCommandCall = async () => {
    }
  }

-  // All checks passed — enable voice
-  const result = updateSettingsForSource('userSettings', { voiceEnabled: true })
+  // All checks passed — enable voice with provider
+  const result = updateSettingsForSource('userSettings', {
+    voiceEnabled: true,
+    ...(provider === 'doubao' ? { voiceProvider: 'doubao' } : {}),
+  })
  if (result.error) {
    return {
      type: 'text' as const,
@@ -123,28 +160,30 @@ export const call: LocalCommandCall = async () => {
  settingsChangeDetector.notifyChange('userSettings')
  logEvent('tengu_voice_toggled', { enabled: true })
  const key = getShortcutDisplay('voice:pushToTalk', 'Chat', 'Space')
-  const stt = normalizeLanguageForSTT(currentSettings.language)
-  const cfg = getGlobalConfig()
-  // Reset the hint counter whenever the resolved STT language changes
-  // (including first-ever enable, where lastLanguage is undefined).
-  const langChanged = cfg.voiceLangHintLastLanguage !== stt.code
-  const priorCount = langChanged ? 0 : (cfg.voiceLangHintShownCount ?? 0)
-  const showHint = !stt.fellBackFrom && priorCount < LANG_HINT_MAX_SHOWS
  let langNote = ''
-  if (stt.fellBackFrom) {
-    langNote = ` Note: "${stt.fellBackFrom}" is not a supported dictation language; using English. Change it via /config.`
-  } else if (showHint) {
-    langNote = ` Dictation language: ${stt.code} (/config to change).`
-  }
-  if (langChanged || showHint) {
-    saveGlobalConfig(prev => ({
-      ...prev,
-      voiceLangHintShownCount: priorCount + (showHint ? 1 : 0),
-      voiceLangHintLastLanguage: stt.code,
-    }))
+  const providerLabel = provider === 'doubao' ? 'Doubao ASR' : 'Anthropic'
+  // Doubao backend handles all languages natively — skip language hints
+  if (provider !== 'doubao') {
+    const stt = normalizeLanguageForSTT(currentSettings.language)
+    const cfg = getGlobalConfig()
+    const langChanged = cfg.voiceLangHintLastLanguage !== stt.code
+    const priorCount = langChanged ? 0 : (cfg.voiceLangHintShownCount ?? 0)
+    const showHint = !stt.fellBackFrom && priorCount < LANG_HINT_MAX_SHOWS
+    if (stt.fellBackFrom) {
+      langNote = ` Note: "${stt.fellBackFrom}" is not a supported dictation language; using English. Change it via /config.`
+    } else if (showHint) {
+      langNote = ` Dictation language: ${stt.code} (/config to change).`
+    }
+    if (langChanged || showHint) {
+      saveGlobalConfig(prev => ({
+        ...prev,
+        voiceLangHintShownCount: priorCount + (showHint ? 1 : 0),
+        voiceLangHintLastLanguage: stt.code,
+      }))
+    }
  }
  return {
    type: 'text' as const,
-    value: `Voice mode enabled. Hold ${key} to record.${langNote}`,
+    value: `Voice mode enabled (${providerLabel}). Hold ${key} to record.${langNote}`,
  }
 }
--- a/src/hooks/useVoice.ts
+++ b/src/hooks/useVoice.ts
@@ -20,6 +20,10 @@ import {
  isVoiceStreamAvailable,
  type VoiceStreamConnection,
 } from '../services/voiceStreamSTT.js'
+import {
+  connectDoubaoStream,
+  isDoubaoAvailableSync,
+} from '../services/doubaoSTT.js'
 import { logForDebugging } from '../utils/debug.js'
 import { toError } from '../utils/errors.js'
 import { getSystemLocaleLanguage } from '../utils/intl.js'
@@ -27,6 +31,10 @@ import { logError } from '../utils/log.js'
 import { getInitialSettings } from '../utils/settings/settings.js'
 import { sleep } from '../utils/sleep.js'

+function isDoubaoProvider(): boolean {
+  return getInitialSettings().voiceProvider === 'doubao'
+}
+
 // ─── Language normalization ─────────────────────────────────────────────

 const DEFAULT_STT_LANGUAGE = 'en'
@@ -574,7 +582,7 @@ export function useVoice({
  // stop when it loses focus. This enables a "multi-clauding army"
  // workflow where voice input follows window focus.
  useEffect(() => {
-    if (!enabled || !focusMode) {
+    if (!enabled || !focusMode || isDoubaoProvider()) {
      // Focus mode was disabled while a focus-driven recording was active —
      // stop the recording so it doesn't linger until the silence timer fires.
      if (focusTriggeredRef.current && stateRef.current === 'recording') {
@@ -778,7 +786,11 @@ export function useVoice({

    const attemptConnect = (keyterms: string[]): void => {
      const myAttemptGen = attemptGenRef.current
-      void connectVoiceStream(
+      // Select STT backend based on settings.voiceProvider
+      const connectFn = isDoubaoProvider()
+        ? (cbs: Parameters<typeof connectDoubaoStream>[0], opts: Parameters<typeof connectDoubaoStream>[1]) => connectDoubaoStream(cbs, opts)
+        : (cbs: Parameters<typeof connectVoiceStream>[0], opts: Parameters<typeof connectVoiceStream>[1]) => connectVoiceStream(cbs, opts)
+      void connectFn(
        {
          onTranscript: (text: string, isFinal: boolean) => {
            if (isStale()) return
@@ -1007,7 +1019,12 @@ export function useVoice({
      })
    }

-    void getVoiceKeyterms().then(attemptConnect)
+    // Doubao backend doesn't use keyterms — skip the async fetch
+    if (isDoubaoProvider()) {
+      attemptConnect([])
+    } else {
+      void getVoiceKeyterms().then(attemptConnect)
+    }
  }

  // ── Hold-to-talk handler ────────────────────────────────────────────
@@ -1021,7 +1038,8 @@ export function useVoice({
  // delay of ~500ms on macOS).
  const handleKeyEvent = useCallback(
    (fallbackMs = REPEAT_FALLBACK_MS): void => {
-      if (!enabled || !isVoiceStreamAvailable()) {
+      const sttAvailable = isDoubaoProvider() ? isDoubaoAvailableSync() : isVoiceStreamAvailable()
+      if (!enabled || !sttAvailable) {
        return
      }

--- a/src/hooks/useVoiceEnabled.ts
+++ b/src/hooks/useVoiceEnabled.ts
@@ -7,19 +7,22 @@ import {

 /**
 * Combines user intent (settings.voiceEnabled) with auth + GB kill-switch.
+ * When using Doubao backend, auth check is skipped (Doubao has its own credentials).
 * Only the auth half is memoized on authVersion — it's the expensive one
 * (cold getClaudeAIOAuthTokens memoize → sync `security` spawn, ~60ms/call,
 * ~180ms total in profile v5 when token refresh cleared the cache mid-session).
 * GB is a cheap cached-map lookup and stays outside the memo so a mid-session
 * kill-switch flip still takes effect on the next render.
- *
- * authVersion bumps on /login only. Background token refresh leaves it alone
- * (user is still authed), so the auth memo stays correct without re-eval.
 */
 export function useVoiceEnabled(): boolean {
  const userIntent = useAppState(s => s.settings.voiceEnabled === true)
+  const provider = useAppState(s => s.settings.voiceProvider)
+  // All hooks must be called unconditionally (Rules of Hooks)
  const authVersion = useAppState(s => s.authVersion)
  // eslint-disable-next-line react-hooks/exhaustive-deps
  const authed = useMemo(hasVoiceAuth, [authVersion])
+  if (provider === 'doubao') {
+    return userIntent && isVoiceGrowthBookEnabled()
+  }
  return userIntent && authed && isVoiceGrowthBookEnabled()
 }
--- a/src/services/doubaoSTT.ts
+++ b/src/services/doubaoSTT.ts
@@ -0,0 +1,230 @@
+// Doubao (豆包) ASR speech-to-text adapter for voice mode.
+//
+// Wraps the doubaoime-asr npm package to expose the same interface as
+// voiceStreamSTT.ts. The doubao backend uses an AsyncGenerator-based
+// streaming protocol internally; this adapter bridges it to the
+// send/finalize/close pattern used by useVoice.ts.
+
+import { homedir } from 'node:os'
+import type { ASRResponse } from 'doubaoime-asr'
+import type { FinalizeSource, VoiceStreamCallbacks, VoiceStreamConnection } from './voiceStreamSTT.js'
+import { logForDebugging } from '../utils/debug.js'
+import { logError } from '../utils/log.js'
+
+// Re-export FinalizeSource so useVoice can import from either module
+export type { FinalizeSource } from './voiceStreamSTT.js'
+
+// Maximum time to wait for the generator to finish after end-of-stream signal.
+const FINALIZE_SAFETY_TIMEOUT_MS = 5_000
+
+// ─── AsyncIterable audio queue ─────────────────────────────────────────
+
+// A push-based queue that implements AsyncIterable<Uint8Array>.
+// send() pushes chunks; push(null) signals end-of-stream.
+class AudioChunkQueue {
+  private chunks: (Uint8Array | null)[] = []
+  private waiting: ((result: IteratorResult<Uint8Array>) => void) | null = null
+  private done = false
+
+  push(chunk: Uint8Array | null): void {
+    if (this.done) return
+    if (chunk === null) {
+      this.done = true
+      if (this.waiting) {
+        const resolve = this.waiting
+        this.waiting = null
+        resolve({ value: undefined, done: true })
+      }
+      return
+    }
+    if (this.waiting) {
+      const resolve = this.waiting
+      this.waiting = null
+      resolve({ value: chunk, done: false })
+    } else {
+      this.chunks.push(chunk)
+    }
+  }
+
+  abort(): void {
+    this.done = true
+    this.chunks.length = 0
+    if (this.waiting) {
+      const resolve = this.waiting
+      this.waiting = null
+      resolve({ value: undefined, done: true })
+    }
+  }
+
+  [Symbol.asyncIterator](): AsyncIterator<Uint8Array> {
+    return {
+      next: async (): Promise<IteratorResult<Uint8Array>> => {
+        if (this.chunks.length > 0) {
+          const chunk = this.chunks.shift()!
+          return { value: chunk, done: false }
+        }
+        if (this.done) {
+          return { value: undefined, done: true }
+        }
+        return new Promise<IteratorResult<Uint8Array>>((resolve) => {
+          this.waiting = resolve
+        })
+      },
+    }
+  }
+}
+
+// ─── Availability ────────────────────────────────────────────────────────
+
+let doubaoAvailable: boolean | null = null
+
+export async function isDoubaoAvailable(): Promise<boolean> {
+  if (doubaoAvailable !== null) return doubaoAvailable
+  try {
+    await import('doubaoime-asr')
+    doubaoAvailable = true
+  } catch {
+    doubaoAvailable = false
+  }
+  return doubaoAvailable
+}
+
+// Synchronous check — returns cached result or optimistic true when
+// VOICE_PROVIDER=doubao is set and no cached result exists yet.
+// The actual import happens in connectDoubaoStream which reports errors.
+export function isDoubaoAvailableSync(): boolean {
+  if (doubaoAvailable !== null) return doubaoAvailable
+  return true
+}
+
+// ─── Connection ──────────────────────────────────────────────────────────
+
+export async function connectDoubaoStream(
+  callbacks: VoiceStreamCallbacks,
+  _options?: { language?: string },
+): Promise<VoiceStreamConnection | null> {
+  let doubaoAsr: typeof import('doubaoime-asr')
+  try {
+    doubaoAsr = await import('doubaoime-asr')
+  } catch {
+    logError(new Error('[doubao-asr] Failed to import doubaoime-asr package'))
+    callbacks.onError('doubaoime-asr package is not installed. Install it with: bun add doubaoime-asr', { fatal: true })
+    return null
+  }
+
+  const { transcribeRealtime, ASRConfig, ResponseType } = doubaoAsr
+
+  const queue = new AudioChunkQueue()
+  let finalized = false
+
+  // Resolve handle for finalize() promise — wrapped in an object to avoid
+  // TypeScript closure-scope type narrowing issues (TS2349 "not callable").
+  const finalizeHandle: { resolve: ((source: FinalizeSource) => void) | null } = { resolve: null }
+
+  const connection: VoiceStreamConnection = {
+    send(audioChunk: Buffer): void {
+      if (finalized) return
+      queue.push(new Uint8Array(audioChunk.buffer, audioChunk.byteOffset, audioChunk.byteLength))
+    },
+    finalize(): Promise<FinalizeSource> {
+      if (finalized) return Promise.resolve<FinalizeSource>('ws_already_closed')
+      finalized = true
+      queue.push(null) // signal end-of-stream to the generator
+      // Doubao returns FINAL_RESULT during recording — by the time the user
+      // releases the key, all transcripts are already in accumulatedRef.
+      // Resolve immediately so the UI skips the 'processing' state and goes
+      // straight to displaying the result.
+      logForDebugging('[doubao-asr] Finalize — resolving immediately')
+      return Promise.resolve<FinalizeSource>('post_closestream_endpoint')
+    },
+    close(): void {
+      finalized = true
+      queue.abort()
+      const r = finalizeHandle.resolve
+      finalizeHandle.resolve = null
+      if (r) r('ws_close')
+      callbacks.onClose()
+    },
+    isConnected(): boolean {
+      return true
+    },
+  }
+
+  // Start the ASR session in the background
+  const config = new ASRConfig({ credentialPath: `${homedir()}/.claude/tts/doubao/credentials.json` })
+
+  // Ensure credentials are initialized (may auto-generate)
+  try {
+    await config.ensureCredentials()
+  } catch (err) {
+    logError(new Error(`[doubao-asr] Credential initialization failed: ${String(err)}`))
+    callbacks.onError(`Doubao ASR 凭证初始化失败: ${String(err)}`, { fatal: true })
+    return null
+  }
+
+  // Fire onReady immediately — unlike the Anthropic WebSocket which needs to
+  // wait for a handshake, the doubao backend accepts audio through the queue
+  // and handles connection internally. The caller (useVoice.ts) needs onReady
+  // to fire before it will route audio chunks via connection.send().
+  logForDebugging('[doubao-asr] Firing onReady immediately')
+  callbacks.onReady(connection)
+
+  // Consume the AsyncGenerator in the background
+  void (async () => {
+    try {
+      const audioSource: AsyncIterable<Uint8Array> = queue
+      const gen: AsyncGenerator<ASRResponse> = transcribeRealtime(audioSource, { config })
+
+      for await (const resp of gen) {
+        if (finalized && resp.type !== ResponseType.FINAL_RESULT && resp.type !== ResponseType.SESSION_FINISHED) {
+          continue
+        }
+
+        switch (resp.type) {
+          case ResponseType.SESSION_STARTED:
+            logForDebugging('[doubao-asr] Session started')
+            break
+          case ResponseType.VAD_START:
+            logForDebugging('[doubao-asr] VAD detected speech start')
+            break
+          case ResponseType.INTERIM_RESULT:
+            if (resp.text) {
+              callbacks.onTranscript(resp.text, false)
+            }
+            break
+          case ResponseType.FINAL_RESULT:
+            if (resp.text) {
+              callbacks.onTranscript(resp.text, true)
+            }
+            break
+          case ResponseType.ERROR:
+            logError(new Error(`[doubao-asr] Error: ${resp.errorMsg}`))
+            if (!finalized) {
+              callbacks.onError(resp.errorMsg || 'Doubao ASR 识别错误')
+            }
+            break
+          case ResponseType.SESSION_FINISHED:
+            logForDebugging('[doubao-asr] Session finished')
+            break
+          default:
+            break
+        }
+      }
+
+      // Generator exhausted naturally
+      const r = finalizeHandle.resolve
+      finalizeHandle.resolve = null
+      if (r) r('post_closestream_endpoint')
+    } catch (err) {
+      logError(new Error(`[doubao-asr] Stream error: ${String(err)}`))
+      if (!finalized) {
+        callbacks.onError(`Doubao ASR 连接错误: ${String(err)}`)
+      }
+      const r2 = finalizeHandle.resolve
+      finalizeHandle.resolve = null
+      if (r2) r2('ws_close')
+    }
+  })()
+
+  return connection
+}
--- a/src/utils/settings/types.ts
+++ b/src/utils/settings/types.ts
@@ -880,6 +880,10 @@ export const SettingsSchema = lazySchema(() =>
              .boolean()
              .optional()
              .describe('Enable voice mode (hold-to-talk dictation)'),
+            voiceProvider: z
+              .enum(['anthropic', 'doubao'])
+              .optional()
+              .describe('Voice STT backend: "anthropic" (default) or "doubao" (Doubao ASR)'),
          }
        : {}),
      ...(feature('KAIROS')
--- a/src/utils/suggestions/tests/commandSuggestions.test.ts
+++ b/src/utils/suggestions/tests/commandSuggestions.test.ts
@@ -23,7 +23,7 @@ function makeCommand(name: string, opts?: Partial<Command>): Command {
    type: 'local',
    handler: () => {},
    ...opts,
-  } as Command
+  } as unknown as Command
 }

 function makePromptCommand(
@@ -37,7 +37,7 @@ function makePromptCommand(
    handler: () => {},
    source: 'userSettings',
    ...opts,
-  } as Command
+  } as unknown as Command
 }

 // ─── isCommandInput ───────────────────────────────────────────────────
--- a/src/voice/voiceModeEnabled.ts
+++ b/src/voice/voiceModeEnabled.ts
@@ -44,11 +44,18 @@ export function hasVoiceAuth(): boolean {
 }

 /**
- * Full runtime check: auth + GrowthBook kill-switch. Callers: `/voice`
- * (voice.ts, voice/index.ts), ConfigTool, VoiceModeNotice — command-time
- * paths where a fresh keychain read is acceptable. For React render
- * paths use useVoiceEnabled() instead (memoizes the auth half).
+ * Full runtime check for Anthropic voice_stream backend.
+ * Returns true when both auth + GrowthBook kill-switch pass.
 */
 export function isVoiceModeEnabled(): boolean {
  return hasVoiceAuth() && isVoiceGrowthBookEnabled()
 }
+
+/**
+ * Check if voice mode can be activated with any STT backend.
+ * Always returns true when VOICE_MODE feature flag is on and GrowthBook
+ * kill-switch is off — the Doubao backend does not require Anthropic auth.
+ */
+export function isVoiceAvailable(): boolean {
+  return isVoiceGrowthBookEnabled()
+}