From 2e7fc428cd7fab7926af01fc3ed3c680468d33a4 Mon Sep 17 00:00:00 2001 From: claude-code-best Date: Sat, 25 Apr 2026 13:57:30 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E9=9B=86=E6=88=90=E8=B1=86=E5=8C=85=20?= =?UTF-8?q?ASR=20=E8=AF=AD=E9=9F=B3=E8=AF=86=E5=88=AB=E5=90=8E=E7=AB=AF?= =?UTF-8?q?=EF=BC=8C=E6=94=AF=E6=8C=81=20/voice=20doubao=20=E5=88=87?= =?UTF-8?q?=E6=8D=A2=20(#357)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: 集成豆包 ASR 语音识别后端,支持 /voice doubao 切换 - 新增 src/services/doubaoSTT.ts 适配模块,将 doubaoime-asr 的 AsyncGenerator 协议适配为现有 VoiceStreamConnection 接口 - /voice doubao 启用豆包后端,/voice 使用默认 Anthropic 后端 - 后端选择持久化到 settings.json 的 voiceProvider 字段 - 豆包后端跳过 Anthropic OAuth 认证、语言限制和 Focus Mode - 豆包后端松手即出结果,跳过 processing 状态 - 凭证文件存放在 ~/.claude/tts/doubao/credentials.json - doubaoime-asr 作为 optionalDependencies 安装 - 移除 /voice 命令的 claude-ai 可用性限制,所有用户可用 Co-Authored-By: Claude Opus 4.7 * docs: 更新 Voice Mode 文档,添加豆包 ASR 后端说明和致谢 Co-Authored-By: Claude Opus 4.7 --------- Co-authored-by: Claude Opus 4.7 --- .gitignore | 1 + README.md | 6 +- bun.lock | 13 +- docs/features/voice-mode.md | 198 ++++++++++++--- package.json | 3 + src/commands/voice/index.ts | 10 +- src/commands/voice/voice.ts | 115 ++++++--- src/hooks/useVoice.ts | 26 +- src/hooks/useVoiceEnabled.ts | 9 +- src/services/doubaoSTT.ts | 230 ++++++++++++++++++ src/utils/settings/types.ts | 4 + .../__tests__/commandSuggestions.test.ts | 4 +- src/voice/voiceModeEnabled.ts | 15 +- 13 files changed, 545 insertions(+), 89 deletions(-) create mode 100644 src/services/doubaoSTT.ts diff --git a/.gitignore b/.gitignore index f84c208ba..bb75ce640 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,4 @@ data !.codex/prompts/ !.codex/prompts/** teach-me +credentials.json diff --git a/README.md b/README.md index 4d4130289..6a69ccd3c 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ | **Poor Mode** | 穷鬼模式,关闭记忆提取和键入建议,大幅度减少并发请求 | /poor 可以开关 | | **Channels 频道通知** | MCP 
服务器推送外部消息到会话(飞书/Slack/Discord/微信等),`--channels plugin:name@marketplace` 启用 | [文档](https://ccb.agent-aura.top/docs/features/channels) | | **自定义模型供应商** | OpenAI/Anthropic/Gemini/Grok 兼容 | [文档](https://ccb.agent-aura.top/docs/features/custom-platform-login) | -| Voice Mode | Push-to-Talk 语音输入 | [文档](https://ccb.agent-aura.top/docs/features/voice-mode) | +| Voice Mode | 语音输入,支持豆包语音输入(`/voice doubao`) | [文档](https://ccb.agent-aura.top/docs/features/voice-mode) | | Computer Use | 屏幕截图、键鼠控制 | [文档](https://ccb.agent-aura.top/docs/features/computer-use) | | Chrome Use | 浏览器自动化、表单填写、数据抓取 | [自托管](https://ccb.agent-aura.top/docs/features/chrome-use-mcp) [原生版](https://ccb.agent-aura.top/docs/features/claude-in-chrome-mcp) | | Sentry | 企业级错误追踪 | [文档](https://ccb.agent-aura.top/docs/internals/sentry-setup) | @@ -233,6 +233,10 @@ TUI (REPL) 模式需要真实终端,无法直接通过 VS Code launch 启动 +## 致谢 + +- [doubaoime-asr](https://github.com/starccy/doubaoime-asr) — 豆包 ASR 语音识别 SDK,为 Voice Mode 提供无需 Anthropic OAuth 的语音输入方案 + ## 许可证 本项目仅供学习研究用途。Claude Code 的所有权利归 [Anthropic](https://www.anthropic.com/) 所有。 diff --git a/bun.lock b/bun.lock index 84ba36a18..d27afe1ec 100644 --- a/bun.lock +++ b/bun.lock @@ -145,6 +145,9 @@ "yaml": "^2.8.3", "zod": "^4.3.6", }, + "optionalDependencies": { + "doubaoime-asr": "^0.1.0", + }, }, "packages/@ant/claude-for-chrome-mcp": { "name": "@ant/claude-for-chrome-mcp", @@ -1791,6 +1794,8 @@ "dompurify": ["dompurify@3.4.0", "https://registry.npmmirror.com/dompurify/-/dompurify-3.4.0.tgz", { "optionalDependencies": { "@types/trusted-types": "^2.0.7" } }, "sha512-nolgK9JcaUXMSmW+j1yaSvaEaoXYHwWyGJlkoCTghc97KgGDDSnpoU/PlEnw63Ah+TGKFOyY+X5LnxaWbCSfXg=="], + "doubaoime-asr": ["doubaoime-asr@0.1.0", "", { "dependencies": { "opus-encdec": "^0.1.1", "protobufjs": "^8.0.0", "ws": "^8.18.0" }, "bin": { "doubaoime-asr": "bin/doubaoime-asr.mjs" } }, "sha512-HYUfHkTxNdOoztXwS18e6GBRLY9dSDWX43K4WvPvEmO6+RevO6WbawMMoUfHKPb4ySQn461un7XyN5l4UGejwg=="], + "dunder-proto": 
["dunder-proto@1.0.1", "https://registry.npmmirror.com/dunder-proto/-/dunder-proto-1.0.1.tgz", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], "ecdsa-sig-formatter": ["ecdsa-sig-formatter@1.0.11", "https://registry.npmmirror.com/ecdsa-sig-formatter/-/ecdsa-sig-formatter-1.0.11.tgz", { "dependencies": { "safe-buffer": "^5.0.1" } }, "sha512-nagl3RYrbNv6kQkeJIpt6NJZy8twLB/2vtz6yN9Z4vRKHN4/QZJIEbqohALSgwKdnksuY3k5Addp5lg8sVoVcQ=="], @@ -2343,6 +2348,8 @@ "openai": ["openai@6.34.0", "https://registry.npmmirror.com/openai/-/openai-6.34.0.tgz", { "peerDependencies": { "ws": "^8.18.0", "zod": "^3.25 || ^4.0" }, "optionalPeers": ["ws", "zod"], "bin": { "openai": "bin/cli" } }, "sha512-yEr2jdGf4tVFYG6ohmr3pF6VJuveP0EA/sS8TBx+4Eq5NT10alu5zg2dmxMXMgqpihRDQlFGpRt2XwsGj+Fyxw=="], + "opus-encdec": ["opus-encdec@0.1.1", "", {}, "sha512-TDzyGqYqrwn5UEUNaLsfLGu8Ma+HRNrgLYj7Vx5wfTnafAA21G6Bnm/qTIa3orQi/yZPZYmkdpO/gez4nfA1Rw=="], + "os-tmpdir": ["os-tmpdir@1.0.2", "https://registry.npmmirror.com/os-tmpdir/-/os-tmpdir-1.0.2.tgz", {}, "sha512-D2FR03Vir7FIu45XBY20mTb+/ZSWB00sjU9jdQXt83gDrI4Ztz5Fs7/yy74g2N5SVQY4xY1qDr4rNddwYRVX0g=="], "oxc-parser": ["oxc-parser@0.121.0", "https://registry.npmmirror.com/oxc-parser/-/oxc-parser-0.121.0.tgz", { "dependencies": { "@oxc-project/types": "^0.121.0" }, "optionalDependencies": { "@oxc-parser/binding-android-arm-eabi": "0.121.0", "@oxc-parser/binding-android-arm64": "0.121.0", "@oxc-parser/binding-darwin-arm64": "0.121.0", "@oxc-parser/binding-darwin-x64": "0.121.0", "@oxc-parser/binding-freebsd-x64": "0.121.0", "@oxc-parser/binding-linux-arm-gnueabihf": "0.121.0", "@oxc-parser/binding-linux-arm-musleabihf": "0.121.0", "@oxc-parser/binding-linux-arm64-gnu": "0.121.0", "@oxc-parser/binding-linux-arm64-musl": "0.121.0", "@oxc-parser/binding-linux-ppc64-gnu": "0.121.0", 
"@oxc-parser/binding-linux-riscv64-gnu": "0.121.0", "@oxc-parser/binding-linux-riscv64-musl": "0.121.0", "@oxc-parser/binding-linux-s390x-gnu": "0.121.0", "@oxc-parser/binding-linux-x64-gnu": "0.121.0", "@oxc-parser/binding-linux-x64-musl": "0.121.0", "@oxc-parser/binding-openharmony-arm64": "0.121.0", "@oxc-parser/binding-wasm32-wasi": "0.121.0", "@oxc-parser/binding-win32-arm64-msvc": "0.121.0", "@oxc-parser/binding-win32-ia32-msvc": "0.121.0", "@oxc-parser/binding-win32-x64-msvc": "0.121.0" } }, "sha512-ek9o58+SCv6AV7nchiAcUJy1DNE2CC5WRdBcO0mF+W4oRjNQfPO7b3pLjTHSFECpHkKGOZSQxx3hk8viIL5YCg=="], @@ -2435,7 +2442,7 @@ "property-information": ["property-information@7.1.0", "https://registry.npmmirror.com/property-information/-/property-information-7.1.0.tgz", {}, "sha512-TwEZ+X+yCJmYfL7TPUOcvBZ4QfoT5YenQiJuX//0th53DE6w0xxLEtfK3iyryQFddXuvkIk51EEgrJQ0WJkOmQ=="], - "protobufjs": ["protobufjs@7.5.4", "https://registry.npmmirror.com/protobufjs/-/protobufjs-7.5.4.tgz", { "dependencies": { "@protobufjs/aspromise": "^1.1.2", "@protobufjs/base64": "^1.1.2", "@protobufjs/codegen": "^2.0.4", "@protobufjs/eventemitter": "^1.1.0", "@protobufjs/fetch": "^1.1.0", "@protobufjs/float": "^1.0.2", "@protobufjs/inquire": "^1.1.0", "@protobufjs/path": "^1.1.2", "@protobufjs/pool": "^1.1.0", "@protobufjs/utf8": "^1.1.0", "@types/node": ">=13.7.0", "long": "^5.0.0" } }, "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg=="], + "protobufjs": ["protobufjs@8.0.1", "", { "dependencies": { "@protobufjs/aspromise": "^1.1.2", "@protobufjs/base64": "^1.1.2", "@protobufjs/codegen": "^2.0.4", "@protobufjs/eventemitter": "^1.1.0", "@protobufjs/fetch": "^1.1.0", "@protobufjs/float": "^1.0.2", "@protobufjs/inquire": "^1.1.0", "@protobufjs/path": "^1.1.2", "@protobufjs/pool": "^1.1.0", "@protobufjs/utf8": "^1.1.0", "@types/node": ">=13.7.0", "long": "^5.0.0" } }, 
"sha512-NWWCCscLjs+cOKF/s/XVNFRW7Yih0fdH+9brffR5NZCy8k42yRdl5KlWKMVXuI1vfCoy4o1z80XR/W/QUb3V3w=="], "proxy-addr": ["proxy-addr@2.0.7", "https://registry.npmmirror.com/proxy-addr/-/proxy-addr-2.0.7.tgz", { "dependencies": { "forwarded": "0.2.0", "ipaddr.js": "1.9.1" } }, "sha512-llQsMLSUDUPT44jdrU/O37qlnifitDP+ZwrmmZcoSKyLKvtZxpyV0n2/bD/N4tBAAZ/gJEdZU7KMraoK1+XYAg=="], @@ -3029,6 +3036,8 @@ "@fastify/otel/@opentelemetry/instrumentation": ["@opentelemetry/instrumentation@0.212.0", "https://registry.npmmirror.com/@opentelemetry/instrumentation/-/instrumentation-0.212.0.tgz", { "dependencies": { "@opentelemetry/api-logs": "0.212.0", "import-in-the-middle": "^2.0.6", "require-in-the-middle": "^8.0.0" }, "peerDependencies": { "@opentelemetry/api": "^1.3.0" } }, "sha512-IyXmpNnifNouMOe0I/gX7ENfv2ZCNdYTF0FpCsoBcpbIHzk81Ww9rQTYTnvghszCg7qGrIhNvWC8dhEifgX9Jg=="], + "@grpc/proto-loader/protobufjs": ["protobufjs@7.5.4", "https://registry.npmmirror.com/protobufjs/-/protobufjs-7.5.4.tgz", { "dependencies": { "@protobufjs/aspromise": "^1.1.2", "@protobufjs/base64": "^1.1.2", "@protobufjs/codegen": "^2.0.4", "@protobufjs/eventemitter": "^1.1.0", "@protobufjs/fetch": "^1.1.0", "@protobufjs/float": "^1.0.2", "@protobufjs/inquire": "^1.1.0", "@protobufjs/path": "^1.1.2", "@protobufjs/pool": "^1.1.0", "@protobufjs/utf8": "^1.1.0", "@types/node": ">=13.7.0", "long": "^5.0.0" } }, "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg=="], + "@grpc/proto-loader/yargs": ["yargs@17.7.2", "https://registry.npmmirror.com/yargs/-/yargs-17.7.2.tgz", { "dependencies": { "cliui": "^8.0.1", "escalade": "^3.1.1", "get-caller-file": "^2.0.5", "require-directory": "^2.1.1", "string-width": "^4.2.3", "y18n": "^5.0.5", "yargs-parser": "^21.1.1" } }, "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w=="], "@hono/node-ws/@hono/node-server": ["@hono/node-server@1.19.13", 
"https://registry.npmmirror.com/@hono/node-server/-/node-server-1.19.13.tgz", { "peerDependencies": { "hono": "^4" } }, "sha512-TsQLe4i2gvoTtrHje625ngThGBySOgSK3Xo2XRYOdqGN1teR8+I7vchQC46uLJi8OF62YTYA3AhSpumtkhsaKQ=="], @@ -3123,6 +3132,8 @@ "@opentelemetry/otlp-transformer/@opentelemetry/sdk-trace-base": ["@opentelemetry/sdk-trace-base@2.6.1", "https://registry.npmmirror.com/@opentelemetry/sdk-trace-base/-/sdk-trace-base-2.6.1.tgz", { "dependencies": { "@opentelemetry/core": "2.6.1", "@opentelemetry/resources": "2.6.1", "@opentelemetry/semantic-conventions": "^1.29.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.3.0 <1.10.0" } }, "sha512-r86ut4T1e8vNwB35CqCcKd45yzqH6/6Wzvpk2/cZB8PsPLlZFTvrh8yfOS3CYZYcUmAx4hHTZJ8AO8Dj8nrdhw=="], + "@opentelemetry/otlp-transformer/protobufjs": ["protobufjs@7.5.4", "https://registry.npmmirror.com/protobufjs/-/protobufjs-7.5.4.tgz", { "dependencies": { "@protobufjs/aspromise": "^1.1.2", "@protobufjs/base64": "^1.1.2", "@protobufjs/codegen": "^2.0.4", "@protobufjs/eventemitter": "^1.1.0", "@protobufjs/fetch": "^1.1.0", "@protobufjs/float": "^1.0.2", "@protobufjs/inquire": "^1.1.0", "@protobufjs/path": "^1.1.2", "@protobufjs/pool": "^1.1.0", "@protobufjs/utf8": "^1.1.0", "@types/node": ">=13.7.0", "long": "^5.0.0" } }, "sha512-CvexbZtbov6jW2eXAvLukXjXUW1TzFaivC46BpWc/3BpcCysb5Vffu+B3XHMm8lVEuy2Mm4XGex8hBSg1yapPg=="], + "@opentelemetry/sdk-logs/@opentelemetry/core": ["@opentelemetry/core@2.6.1", "https://registry.npmmirror.com/@opentelemetry/core/-/core-2.6.1.tgz", { "dependencies": { "@opentelemetry/semantic-conventions": "^1.29.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.0.0 <1.10.0" } }, "sha512-8xHSGWpJP9wBxgBpnqGL0R3PbdWQndL1Qp50qrg71+B28zK5OQmUgcDKLJgzyAAV38t4tOyLMGDD60LneR5W8g=="], "@opentelemetry/sdk-logs/@opentelemetry/resources": ["@opentelemetry/resources@2.6.1", "https://registry.npmmirror.com/@opentelemetry/resources/-/resources-2.6.1.tgz", { "dependencies": { "@opentelemetry/core": "2.6.1", 
"@opentelemetry/semantic-conventions": "^1.29.0" }, "peerDependencies": { "@opentelemetry/api": ">=1.3.0 <1.10.0" } }, "sha512-lID/vxSuKWXM55XhAKNoYXu9Cutoq5hFdkbTdI/zDKQktXzcWBVhNsOkiZFTMU9UtEWuGRNe0HUgmsFldIdxVA=="], diff --git a/docs/features/voice-mode.md b/docs/features/voice-mode.md index b2a6d5eee..269c54c90 100644 --- a/docs/features/voice-mode.md +++ b/docs/features/voice-mode.md @@ -1,27 +1,32 @@ # VOICE_MODE — 语音输入 > Feature Flag: `FEATURE_VOICE_MODE=1` -> 实现状态:完整可用(需要 Anthropic OAuth) +> 实现状态:完整可用(双后端:Anthropic OAuth / 豆包 ASR) > 引用数:46 ## 一、功能概述 -VOICE_MODE 实现"按键说话"(Push-to-Talk)语音输入。用户按住空格键录音,音频通过 WebSocket 流式传输到 Anthropic STT 端点(Nova 3),实时转录显示在终端中。 +VOICE_MODE 实现"按键说话"(Push-to-Talk)语音输入。用户按住空格键录音,音频流式传输到 STT 后端,实时转录显示在终端中。支持两个后端: + +- **Anthropic STT(默认)**:通过 WebSocket 流式传输到 Nova 3 端点,需要 Anthropic OAuth +- **豆包 ASR(Doubao)**:通过 `doubaoime-asr` 包的 AsyncGenerator 协议流式识别,使用独立凭证文件,无需 Anthropic OAuth ### 核心特性 - **Push-to-Talk**:长按空格键录音,释放后自动发送 - **流式转录**:录音过程中实时显示中间转录结果 - **无缝集成**:转录文本直接作为用户消息提交到对话 +- **双后端切换**:通过 `/voice` 命令参数选择 STT 后端,持久化到 settings.json ## 二、用户交互 | 操作 | 行为 | |------|------| | 长按空格 | 开始录音,显示录音状态 | -| 释放空格 | 停止录音,等待最终转录 | -| 转录完成 | 自动插入到输入框并提交 | -| `/voice` 命令 | 切换语音模式开关 | +| 释放空格 | 停止录音,转录结果自动提交 | +| `/voice` | 切换语音模式开关(默认使用 Anthropic 后端) | +| `/voice doubao` | 启用语音模式并使用豆包 ASR 后端 | +| `/voice anthropic` | 切换回 Anthropic STT 后端 | ### UI 反馈 @@ -35,26 +40,37 @@ VOICE_MODE 实现"按键说话"(Push-to-Talk)语音输入。用户按住空 文件:`src/voice/voiceModeEnabled.ts` -三层检查: +两层检查函数: ```ts +// Anthropic 后端(需要 OAuth) isVoiceModeEnabled() = hasVoiceAuth() && isVoiceGrowthBookEnabled() + +// 豆包后端 / 通用可用性检查(不需要 OAuth) +isVoiceAvailable() = isVoiceGrowthBookEnabled() ``` 1. **Feature Flag**:`feature('VOICE_MODE')` — 编译时/运行时开关 2. **GrowthBook Kill-Switch**:`!getFeatureValue_CACHED_MAY_BE_STALE('tengu_amber_quartz_disabled', false)` — 紧急关闭开关(默认 false = 未禁用) -3. **Auth 检查**:`hasVoiceAuth()` — 需要 Anthropic OAuth token(非 API key) +3. 
**Auth 检查(仅 Anthropic)**:`hasVoiceAuth()` — 需要 Anthropic OAuth token(非 API key) +4. **Provider 检查**:`voiceProvider` 设置决定使用哪个后端,豆包后端跳过 OAuth 检查 ### 3.2 核心模块 | 模块 | 职责 | |------|------| | `src/voice/voiceModeEnabled.ts` | Feature flag + GrowthBook + Auth 三层门控 | -| `src/hooks/useVoice.ts` | React hook 管理录音状态和 WebSocket 连接 | -| `src/services/voiceStreamSTT.ts` | WebSocket 流式传输到 Anthropic STT | +| `src/hooks/useVoice.ts` | React hook 管理录音状态和后端连接 | +| `src/services/voiceStreamSTT.ts` | Anthropic WebSocket 流式 STT | +| `src/services/doubaoSTT.ts` | 豆包 ASR 适配器(AsyncGenerator → VoiceStreamConnection) | +| `src/commands/voice/voice.ts` | `/voice` 命令实现,处理后端选择和持久化 | +| `src/hooks/useVoiceEnabled.ts` | 语音启用状态 hook,根据 provider 决定是否跳过 OAuth | +| `src/utils/settings/types.ts` | `voiceProvider: 'anthropic' \| 'doubao'` 设置类型定义 | ### 3.3 数据流 +#### Anthropic 后端 + ``` 用户按下空格键 │ @@ -79,20 +95,108 @@ WebSocket 连接到 Anthropic STT 端点 转录文本 → 插入输入框 → 自动提交 ``` +#### 豆包 ASR 后端 + +``` +用户按下空格键 + │ + ▼ +useVoice hook 激活(检测到 voiceProvider === 'doubao') + │ + ▼ +macOS 原生音频 / SoX 开始录音 + │ + ▼ +connectDoubaoStream() 创建 AudioChunkQueue + VoiceStreamConnection + │ + ├──→ onReady 立即触发(无需等待握手) + │ + ▼ +音频数据通过 AudioChunkQueue 传入 transcribeRealtime() + │ + ├──→ INTERIM_RESULT → 实时显示中间转录 + ├──→ FINAL_RESULT → 显示最终转录 + │ + ▼ +用户释放空格键 + │ + ▼ +finalize() 立即返回(豆包在录音过程中已返回结果,无需等待) + │ + ▼ +转录文本 → 插入输入框 → 自动提交 +``` + ### 3.4 音频录制 -支持两种音频后端: +支持两种音频后端(两个 STT 后端共享): - **macOS 原生音频**:优先使用,低延迟 - **SoX(Sound eXchange)**:回退方案,跨平台 -音频流通过 WebSocket 发送到 Anthropic 的 Nova 3 STT 模型。 +### 3.5 豆包 ASR 适配器设计 + +文件:`src/services/doubaoSTT.ts` + +豆包后端使用适配器模式,将 `doubaoime-asr` 的 AsyncGenerator 协议桥接到 `VoiceStreamConnection` 接口: + +**AudioChunkQueue** — push 式异步队列: +- 实现 `AsyncIterable` 接口 +- `push(chunk)` 将音频数据入队,`push(null)` 发送结束信号 +- 内部维护等待者(waiting)和缓冲队列(chunks)两个状态 + +**connectDoubaoStream()** — 连接入口: +- 动态导入 `doubaoime-asr`(optionalDependencies) +- 从 `~/.claude/tts/doubao/credentials.json` 加载凭证 +- 创建 AudioChunkQueue 和 
VoiceStreamConnection +- 立即触发 `onReady`(避免与 useVoice 的音频缓冲死锁) +- `finalize()` 立即返回(豆包在录音过程中已返回结果) +- 后台 async IIFE 消费 `transcribeRealtime` generator,映射响应类型到回调 + +**响应类型映射**: + +| doubaoime-asr ResponseType | 回调映射 | +|----------------------------|----------| +| SESSION_STARTED | 日志记录 | +| VAD_START | 日志记录 | +| INTERIM_RESULT | `onTranscript(text, false)` | +| FINAL_RESULT | `onTranscript(text, true)` | +| ERROR | `onError(errorMsg)` | +| SESSION_FINISHED | 日志记录 | + +### 3.6 后端选择逻辑 + +文件:`src/hooks/useVoice.ts` + +```ts +// 判断当前 provider +isDoubaoProvider() → 读取 settings.voiceProvider + +// handleKeyEvent 中的可用性检查 +const sttAvailable = isDoubaoProvider() + ? isDoubaoAvailableSync() // 乐观检查(首次返回 true) + : isVoiceStreamAvailable() // Anthropic WebSocket 检查 + +// attemptConnect 中的连接函数选择 +const connectFn = isDoubaoProvider() + ? connectDoubaoStream + : connectVoiceStream +``` + +豆包后端的特殊处理: +- 跳过 `getVoiceKeyterms()` 调用(豆包无需关键词提示) +- 跳过 Focus Mode(`if (!enabled || !focusMode || isDoubaoProvider())`) ## 四、关键设计决策 -1. **OAuth 独占**:语音模式使用 `voice_stream` 端点(claude.ai),仅 Anthropic OAuth 用户可用。API key、Bedrock、Vertex 用户无法使用 -2. **GrowthBook 负向门控**:`tengu_amber_quartz_disabled` 默认 `false`,新安装自动可用(无需等 GrowthBook 初始化) -3. **Keychain 缓存**:`getClaudeAIOAuthTokens()` 首次调用访问 macOS keychain(~20-50ms),后续缓存命中 -4. **独立于主 feature flag**:`isVoiceGrowthBookEnabled()` 在 feature flag 关闭时短路返回 `false`,不触发任何模块加载 +1. **双后端共存**:豆包后端作为独立适配器与 Anthropic 后端并存,不替换原有流程,通过 `voiceProvider` 设置切换 +2. **设置持久化**:`voiceProvider` 存储在 `settings.json`,通过 `/voice` 命令修改,跨会话生效 +3. **OAuth 独占(Anthropic)**:Anthropic 后端使用 `voice_stream` 端点(claude.ai),仅 OAuth 用户可用 +4. **豆包无需 OAuth**:豆包后端使用独立凭证文件,不依赖 Anthropic 认证,通过 `isVoiceAvailable()` 放宽门控 +5. **GrowthBook 负向门控**:`tengu_amber_quartz_disabled` 默认 `false`,新安装自动可用 +6. **onReady 立即触发**:豆包后端在连接建立后立即触发 `onReady`,避免与 useVoice 音频缓冲的时序死锁(Anthropic 需要等待 WebSocket 握手) +7. **finalize() 立即返回**:豆包在录音过程中已返回所有结果,用户抬手时无需等待处理 +8. 
**乐观可用性检查**:`isDoubaoAvailableSync()` 在首次调用时返回 `true`,实际导入错误在 `connectDoubaoStream` 中处理 +9. **optionalDependencies**:`doubaoime-asr` 作为可选依赖,安装失败不影响 Anthropic 后端 ## 五、使用方式 @@ -100,26 +204,60 @@ WebSocket 连接到 Anthropic STT 端点 # 启用 feature FEATURE_VOICE_MODE=1 bun run dev -# 在 REPL 中使用 +# 在 REPL 中使用 Anthropic 后端 # 1. 确保已通过 OAuth 登录(claude.ai 订阅) -# 2. 按住空格键说话 -# 3. 释放空格键等待转录 -# 4. 或使用 /voice 命令切换开关 +# 2. 输入 /voice 启用 +# 3. 按住空格键说话 +# 4. 释放空格键等待转录 + +# 在 REPL 中使用豆包 ASR 后端 +# 1. 确保 doubaoime-asr 已安装(bun add doubaoime-asr) +# 2. 配置凭证文件:~/.claude/tts/doubao/credentials.json +# 3. 输入 /voice doubao 启用 +# 4. 按住空格键说话 +# 5. 释放空格键,转录结果即刻显示 + +# 切换后端 +/voice doubao # 切换到豆包 ASR +/voice anthropic # 切换回 Anthropic STT +/voice # 关闭语音模式 +``` + +### 豆包凭证配置 + +凭证文件路径:`~/.claude/tts/doubao/credentials.json` + +```json +{ + "deviceId": "...", + "installId": "...", + "cdid": "...", + "openudid": "...", + "clientudid": "...", + "token": "..." +} ``` ## 六、外部依赖 -| 依赖 | 说明 | -|------|------| -| Anthropic OAuth | claude.ai 订阅登录,非 API key | -| GrowthBook | `tengu_amber_quartz_disabled` 紧急关闭 | -| macOS 原生音频 或 SoX | 音频录制 | -| Nova 3 STT | 语音转文本模型 | +| 依赖 | 说明 | 适用后端 | +|------|------|----------| +| Anthropic OAuth | claude.ai 订阅登录,非 API key | Anthropic | +| GrowthBook | `tengu_amber_quartz_disabled` 紧急关闭 | 通用 | +| macOS 原生音频 或 SoX | 音频录制 | 通用 | +| Nova 3 STT | Anthropic 语音转文本模型 | Anthropic | +| doubaoime-asr | 豆包 ASR SDK(optionalDependencies) | 豆包 | +| 凭证文件 | `~/.claude/tts/doubao/credentials.json` | 豆包 | ## 七、文件索引 -| 文件 | 行数 | 职责 | -|------|------|------| -| `src/voice/voiceModeEnabled.ts` | 54 | 三层门控逻辑 | -| `src/hooks/useVoice.ts` | — | React hook(录音状态 + WebSocket) | -| `src/services/voiceStreamSTT.ts` | — | STT WebSocket 流式传输 | +| 文件 | 职责 | +|------|------| +| `src/voice/voiceModeEnabled.ts` | 三层门控逻辑 + `isVoiceAvailable()` | +| `src/hooks/useVoice.ts` | React hook(录音状态 + 后端选择 + 连接管理) | +| `src/hooks/useVoiceEnabled.ts` | 语音启用状态 hook(按 provider 决定 OAuth 检查) | +| `src/services/voiceStreamSTT.ts` 
| Anthropic STT WebSocket 流式传输 | +| `src/services/doubaoSTT.ts` | 豆包 ASR 适配器(AudioChunkQueue + connectDoubaoStream) | +| `src/commands/voice/voice.ts` | `/voice` 命令(开关 + 后端选择) | +| `src/commands/voice/index.ts` | 命令注册(去除 availability 限制) | +| `src/utils/settings/types.ts` | `voiceProvider` 类型定义 | diff --git a/package.json b/package.json index 1eadeed83..38e3d76bd 100644 --- a/package.json +++ b/package.json @@ -205,5 +205,8 @@ "xss": "^1.0.15", "yaml": "^2.8.3", "zod": "^4.3.6" + }, + "optionalDependencies": { + "doubaoime-asr": "^0.1.0" } } diff --git a/src/commands/voice/index.ts b/src/commands/voice/index.ts index 61540d3ba..213bf2c40 100644 --- a/src/commands/voice/index.ts +++ b/src/commands/voice/index.ts @@ -1,17 +1,15 @@ import type { Command } from '../../commands.js' import { - isVoiceGrowthBookEnabled, - isVoiceModeEnabled, + isVoiceAvailable, } from '../../voice/voiceModeEnabled.js' const voice = { type: 'local', name: 'voice', - description: 'Toggle voice mode', - availability: ['claude-ai'], - isEnabled: () => isVoiceGrowthBookEnabled(), + description: 'Toggle voice mode. 
Use /voice doubao for Doubao ASR backend', + isEnabled: () => isVoiceAvailable(), get isHidden() { - return !isVoiceModeEnabled() + return !isVoiceAvailable() }, supportsNonInteractive: false, load: () => import('./voice.js'), diff --git a/src/commands/voice/voice.ts b/src/commands/voice/voice.ts index f369891bb..95282ea45 100644 --- a/src/commands/voice/voice.ts +++ b/src/commands/voice/voice.ts @@ -2,29 +2,19 @@ import { normalizeLanguageForSTT } from '../../hooks/useVoice.js' import { getShortcutDisplay } from '../../keybindings/shortcutFormat.js' import { logEvent } from '../../services/analytics/index.js' import type { LocalCommandCall } from '../../types/command.js' -import { isAnthropicAuthEnabled } from '../../utils/auth.js' import { getGlobalConfig, saveGlobalConfig } from '../../utils/config.js' import { settingsChangeDetector } from '../../utils/settings/changeDetector.js' import { getInitialSettings, updateSettingsForSource, } from '../../utils/settings/settings.js' -import { isVoiceModeEnabled } from '../../voice/voiceModeEnabled.js' +import { isVoiceAvailable } from '../../voice/voiceModeEnabled.js' const LANG_HINT_MAX_SHOWS = 2 -export const call: LocalCommandCall = async () => { - // Check auth and kill-switch before allowing voice mode - if (!isVoiceModeEnabled()) { - // Differentiate: OAuth-less users get an auth hint, everyone else - // gets nothing (command shouldn't be reachable when the kill-switch is on). - if (!isAnthropicAuthEnabled()) { - return { - type: 'text' as const, - value: - 'Voice mode requires a Claude.ai account. 
Please run /login to sign in.', - } - } +export const call: LocalCommandCall = async (args) => { + // Check kill-switch before allowing voice mode + if (!isVoiceAvailable()) { return { type: 'text' as const, value: 'Voice mode is not available.', @@ -33,6 +23,47 @@ export const call: LocalCommandCall = async () => { const currentSettings = getInitialSettings() const isCurrentlyEnabled = currentSettings.voiceEnabled === true + const providerArg = args?.trim().toLowerCase() + + // Handle provider argument when already enabled — switch backend only + if (isCurrentlyEnabled && providerArg === 'doubao') { + const result = updateSettingsForSource('userSettings', { + voiceProvider: 'doubao', + }) + if (result.error) { + return { + type: 'text' as const, + value: + 'Failed to update settings. Check your settings file for syntax errors.', + } + } + settingsChangeDetector.notifyChange('userSettings') + const key = getShortcutDisplay('voice:pushToTalk', 'Chat', 'Space') + return { + type: 'text' as const, + value: `Voice mode switched to Doubao ASR. Hold ${key} to record.`, + } + } + + // Handle provider argument when already enabled — switch to anthropic + if (isCurrentlyEnabled && providerArg === 'anthropic') { + const result = updateSettingsForSource('userSettings', { + voiceProvider: 'anthropic', + }) + if (result.error) { + return { + type: 'text' as const, + value: + 'Failed to update settings. Check your settings file for syntax errors.', + } + } + settingsChangeDetector.notifyChange('userSettings') + const key = getShortcutDisplay('voice:pushToTalk', 'Chat', 'Space') + return { + type: 'text' as const, + value: `Voice mode switched to Anthropic STT. Hold ${key} to record.`, + } + } // Toggle OFF — no checks needed if (isCurrentlyEnabled) { @@ -54,7 +85,10 @@ export const call: LocalCommandCall = async () => { } } - // Toggle ON — run pre-flight checks first + // Toggle ON — determine provider from argument or default + const provider = providerArg === 'doubao' ? 
'doubao' : 'anthropic' + + // Run pre-flight checks const { isVoiceStreamAvailable } = await import( '../../services/voiceStreamSTT.js' ) @@ -70,8 +104,8 @@ export const call: LocalCommandCall = async () => { } } - // Check for API key - if (!isVoiceStreamAvailable()) { + // Check for API key (only for Anthropic backend — Doubao uses its own credentials) + if (provider !== 'doubao' && !isVoiceStreamAvailable()) { return { type: 'text' as const, value: @@ -111,8 +145,11 @@ export const call: LocalCommandCall = async () => { } } - // All checks passed — enable voice - const result = updateSettingsForSource('userSettings', { voiceEnabled: true }) + // All checks passed — enable voice with provider + const result = updateSettingsForSource('userSettings', { + voiceEnabled: true, + ...(provider === 'doubao' ? { voiceProvider: 'doubao' } : {}), + }) if (result.error) { return { type: 'text' as const, @@ -123,28 +160,30 @@ export const call: LocalCommandCall = async () => { settingsChangeDetector.notifyChange('userSettings') logEvent('tengu_voice_toggled', { enabled: true }) const key = getShortcutDisplay('voice:pushToTalk', 'Chat', 'Space') - const stt = normalizeLanguageForSTT(currentSettings.language) - const cfg = getGlobalConfig() - // Reset the hint counter whenever the resolved STT language changes - // (including first-ever enable, where lastLanguage is undefined). - const langChanged = cfg.voiceLangHintLastLanguage !== stt.code - const priorCount = langChanged ? 0 : (cfg.voiceLangHintShownCount ?? 0) - const showHint = !stt.fellBackFrom && priorCount < LANG_HINT_MAX_SHOWS let langNote = '' - if (stt.fellBackFrom) { - langNote = ` Note: "${stt.fellBackFrom}" is not a supported dictation language; using English. Change it via /config.` - } else if (showHint) { - langNote = ` Dictation language: ${stt.code} (/config to change).` - } - if (langChanged || showHint) { - saveGlobalConfig(prev => ({ - ...prev, - voiceLangHintShownCount: priorCount + (showHint ? 
1 : 0), - voiceLangHintLastLanguage: stt.code, - })) + const providerLabel = provider === 'doubao' ? 'Doubao ASR' : 'Anthropic' + // Doubao backend handles all languages natively — skip language hints + if (provider !== 'doubao') { + const stt = normalizeLanguageForSTT(currentSettings.language) + const cfg = getGlobalConfig() + const langChanged = cfg.voiceLangHintLastLanguage !== stt.code + const priorCount = langChanged ? 0 : (cfg.voiceLangHintShownCount ?? 0) + const showHint = !stt.fellBackFrom && priorCount < LANG_HINT_MAX_SHOWS + if (stt.fellBackFrom) { + langNote = ` Note: "${stt.fellBackFrom}" is not a supported dictation language; using English. Change it via /config.` + } else if (showHint) { + langNote = ` Dictation language: ${stt.code} (/config to change).` + } + if (langChanged || showHint) { + saveGlobalConfig(prev => ({ + ...prev, + voiceLangHintShownCount: priorCount + (showHint ? 1 : 0), + voiceLangHintLastLanguage: stt.code, + })) + } } return { type: 'text' as const, - value: `Voice mode enabled. Hold ${key} to record.${langNote}`, + value: `Voice mode enabled (${providerLabel}). 
Hold ${key} to record.${langNote}`, } } diff --git a/src/hooks/useVoice.ts b/src/hooks/useVoice.ts index 0ac154e37..48209476b 100644 --- a/src/hooks/useVoice.ts +++ b/src/hooks/useVoice.ts @@ -20,6 +20,10 @@ import { isVoiceStreamAvailable, type VoiceStreamConnection, } from '../services/voiceStreamSTT.js' +import { + connectDoubaoStream, + isDoubaoAvailableSync, +} from '../services/doubaoSTT.js' import { logForDebugging } from '../utils/debug.js' import { toError } from '../utils/errors.js' import { getSystemLocaleLanguage } from '../utils/intl.js' @@ -27,6 +31,10 @@ import { logError } from '../utils/log.js' import { getInitialSettings } from '../utils/settings/settings.js' import { sleep } from '../utils/sleep.js' +function isDoubaoProvider(): boolean { + return getInitialSettings().voiceProvider === 'doubao' +} + // ─── Language normalization ───────────────────────────────────────────── const DEFAULT_STT_LANGUAGE = 'en' @@ -574,7 +582,7 @@ export function useVoice({ // stop when it loses focus. This enables a "multi-clauding army" // workflow where voice input follows window focus. useEffect(() => { - if (!enabled || !focusMode) { + if (!enabled || !focusMode || isDoubaoProvider()) { // Focus mode was disabled while a focus-driven recording was active — // stop the recording so it doesn't linger until the silence timer fires. if (focusTriggeredRef.current && stateRef.current === 'recording') { @@ -778,7 +786,11 @@ export function useVoice({ const attemptConnect = (keyterms: string[]): void => { const myAttemptGen = attemptGenRef.current - void connectVoiceStream( + // Select STT backend based on settings.voiceProvider + const connectFn = isDoubaoProvider() + ? 
(cbs: Parameters[0], opts: Parameters[1]) => connectDoubaoStream(cbs, opts) + : (cbs: Parameters[0], opts: Parameters[1]) => connectVoiceStream(cbs, opts) + void connectFn( { onTranscript: (text: string, isFinal: boolean) => { if (isStale()) return @@ -1007,7 +1019,12 @@ export function useVoice({ }) } - void getVoiceKeyterms().then(attemptConnect) + // Doubao backend doesn't use keyterms — skip the async fetch + if (isDoubaoProvider()) { + attemptConnect([]) + } else { + void getVoiceKeyterms().then(attemptConnect) + } } // ── Hold-to-talk handler ──────────────────────────────────────────── @@ -1021,7 +1038,8 @@ export function useVoice({ // delay of ~500ms on macOS). const handleKeyEvent = useCallback( (fallbackMs = REPEAT_FALLBACK_MS): void => { - if (!enabled || !isVoiceStreamAvailable()) { + const sttAvailable = isDoubaoProvider() ? isDoubaoAvailableSync() : isVoiceStreamAvailable() + if (!enabled || !sttAvailable) { return } diff --git a/src/hooks/useVoiceEnabled.ts b/src/hooks/useVoiceEnabled.ts index ece06913f..3d5a9184e 100644 --- a/src/hooks/useVoiceEnabled.ts +++ b/src/hooks/useVoiceEnabled.ts @@ -7,19 +7,22 @@ import { /** * Combines user intent (settings.voiceEnabled) with auth + GB kill-switch. + * When using Doubao backend, auth check is skipped (Doubao has its own credentials). * Only the auth half is memoized on authVersion — it's the expensive one * (cold getClaudeAIOAuthTokens memoize → sync `security` spawn, ~60ms/call, * ~180ms total in profile v5 when token refresh cleared the cache mid-session). * GB is a cheap cached-map lookup and stays outside the memo so a mid-session * kill-switch flip still takes effect on the next render. - * - * authVersion bumps on /login only. Background token refresh leaves it alone - * (user is still authed), so the auth memo stays correct without re-eval. 
*/ export function useVoiceEnabled(): boolean { const userIntent = useAppState(s => s.settings.voiceEnabled === true) + const provider = useAppState(s => s.settings.voiceProvider) + // All hooks must be called unconditionally (Rules of Hooks) const authVersion = useAppState(s => s.authVersion) // eslint-disable-next-line react-hooks/exhaustive-deps const authed = useMemo(hasVoiceAuth, [authVersion]) + if (provider === 'doubao') { + return userIntent && isVoiceGrowthBookEnabled() + } return userIntent && authed && isVoiceGrowthBookEnabled() } diff --git a/src/services/doubaoSTT.ts b/src/services/doubaoSTT.ts new file mode 100644 index 000000000..139ea67e1 --- /dev/null +++ b/src/services/doubaoSTT.ts @@ -0,0 +1,230 @@ +// Doubao (豆包) ASR speech-to-text adapter for voice mode. +// +// Wraps the doubaoime-asr npm package to expose the same interface as +// voiceStreamSTT.ts. The doubao backend uses an AsyncGenerator-based +// streaming protocol internally; this adapter bridges it to the +// send/finalize/close pattern used by useVoice.ts. + +import { homedir } from 'node:os' +import type { ASRResponse } from 'doubaoime-asr' +import type { FinalizeSource, VoiceStreamCallbacks, VoiceStreamConnection } from './voiceStreamSTT.js' +import { logForDebugging } from '../utils/debug.js' +import { logError } from '../utils/log.js' + +// Re-export FinalizeSource so useVoice can import from either module +export type { FinalizeSource } from './voiceStreamSTT.js' + +// Maximum time to wait for the generator to finish after end-of-stream signal. +const FINALIZE_SAFETY_TIMEOUT_MS = 5_000 + +// ─── AsyncIterable audio queue ───────────────────────────────────────── + +// A push-based queue that implements AsyncIterable. +// send() pushes chunks; push(null) signals end-of-stream. 
/**
 * Push-based queue of audio chunks exposed as an `AsyncIterable`.
 *
 * Producers call `push(chunk)` for every audio buffer and `push(null)` once
 * to signal end-of-stream. A consumer drains the queue with `for await`.
 * Chunks pushed before end-of-stream are always delivered in FIFO order;
 * `abort()` ends the stream immediately and discards anything still buffered.
 */
class AudioChunkQueue implements AsyncIterable<Uint8Array> {
  // Chunks that arrived while no consumer was waiting.
  private readonly chunks: Uint8Array[] = []
  // Resolvers of next() calls that arrived while no chunk was buffered.
  // Kept as a FIFO so several outstanding next() calls all get settled;
  // a single slot would silently drop (and hang) the earlier ones.
  private readonly waiters: Array<(result: IteratorResult<Uint8Array>) => void> = []
  private done = false

  /**
   * Enqueue a chunk, or pass `null` to signal end-of-stream.
   * Anything pushed after end-of-stream or abort() is ignored.
   */
  push(chunk: Uint8Array | null): void {
    if (this.done) return
    if (chunk === null) {
      this.done = true
      // Waiters only exist while the buffer is empty, so nothing is lost here.
      this.releaseWaiters()
      return
    }
    const waiter = this.waiters.shift()
    if (waiter) {
      waiter({ value: chunk, done: false })
    } else {
      this.chunks.push(chunk)
    }
  }

  /** End the stream now, dropping buffered chunks and settling pending reads. */
  abort(): void {
    this.done = true
    this.chunks.length = 0
    this.releaseWaiters()
  }

  [Symbol.asyncIterator](): AsyncIterator<Uint8Array> {
    return {
      next: async (): Promise<IteratorResult<Uint8Array>> => {
        const chunk = this.chunks.shift()
        if (chunk !== undefined) {
          return { value: chunk, done: false }
        }
        if (this.done) {
          return { value: undefined, done: true }
        }
        return new Promise<IteratorResult<Uint8Array>>(resolve => {
          this.waiters.push(resolve)
        })
      },
      // Invoked when the consumer leaves its `for await` loop early. Closing
      // the queue makes later push() calls no-ops instead of buffering audio
      // that nobody will ever read.
      return: async (): Promise<IteratorResult<Uint8Array>> => {
        this.abort()
        return { value: undefined, done: true }
      },
    }
  }

  /** Settle every pending next() call with `done: true`. */
  private releaseWaiters(): void {
    for (const waiter of this.waiters.splice(0)) {
      waiter({ value: undefined, done: true })
    }
  }
}

// ─── Availability ────────────────────────────────────────────────────────

// Cached result of the import probe; null until the first probe completes.
let doubaoAvailable: boolean | null = null

/**
 * Probe whether the optional `doubaoime-asr` package can be imported.
 * The outcome is cached in `doubaoAvailable`, so the import is attempted
 * again only while no earlier probe has finished.
 *
 * @returns true when the dynamic import succeeds, false when it throws.
 */
export async function isDoubaoAvailable(): Promise<boolean> {
  if (doubaoAvailable !== null) return doubaoAvailable
  try {
    await import('doubaoime-asr')
    doubaoAvailable = true
  } catch {
    doubaoAvailable = false
  }
  return doubaoAvailable
}

// Synchronous check — returns the cached probe result, or optimistically
// true when no probe has completed yet. The actual import happens in
// connectDoubaoStream, which reports a failure to the caller.
+export function isDoubaoAvailableSync(): boolean { + if (doubaoAvailable !== null) return doubaoAvailable + return true +} + +// ─── Connection ────────────────────────────────────────────────────────── + +export async function connectDoubaoStream( + callbacks: VoiceStreamCallbacks, + _options?: { language?: string }, +): Promise { + let doubaoAsr: typeof import('doubaoime-asr') + try { + doubaoAsr = await import('doubaoime-asr') + } catch { + logError(new Error('[doubao-asr] Failed to import doubaoime-asr package')) + callbacks.onError('doubaoime-asr package is not installed. Install it with: bun add doubaoime-asr', { fatal: true }) + return null + } + + const { transcribeRealtime, ASRConfig, ResponseType } = doubaoAsr + + const queue = new AudioChunkQueue() + let finalized = false + + // Resolve handle for finalize() promise — wrapped in an object to avoid + // TypeScript closure-scope type narrowing issues (TS2349 "not callable"). + const finalizeHandle: { resolve: ((source: FinalizeSource) => void) | null } = { resolve: null } + + const connection: VoiceStreamConnection = { + send(audioChunk: Buffer): void { + if (finalized) return + queue.push(new Uint8Array(audioChunk.buffer, audioChunk.byteOffset, audioChunk.byteLength)) + }, + finalize(): Promise { + if (finalized) return Promise.resolve('ws_already_closed') + finalized = true + queue.push(null) // signal end-of-stream to the generator + // Doubao returns FINAL_RESULT during recording — by the time the user + // releases the key, all transcripts are already in accumulatedRef. + // Resolve immediately so the UI skips the 'processing' state and goes + // straight to displaying the result. 
+ logForDebugging('[doubao-asr] Finalize — resolving immediately') + return Promise.resolve('post_closestream_endpoint') + }, + close(): void { + finalized = true + queue.abort() + const r = finalizeHandle.resolve + finalizeHandle.resolve = null + if (r) r('ws_close') + callbacks.onClose() + }, + isConnected(): boolean { + return true + }, + } + + // Start the ASR session in the background + const config = new ASRConfig({ credentialPath: `${homedir()}/.claude/tts/doubao/credentials.json` }) + + // Ensure credentials are initialized (may auto-generate) + try { + await config.ensureCredentials() + } catch (err) { + logError(new Error(`[doubao-asr] Credential initialization failed: ${String(err)}`)) + callbacks.onError(`Doubao ASR 凭证初始化失败: ${String(err)}`, { fatal: true }) + return null + } + + // Fire onReady immediately — unlike the Anthropic WebSocket which needs to + // wait for a handshake, the doubao backend accepts audio through the queue + // and handles connection internally. The caller (useVoice.ts) needs onReady + // to fire before it will route audio chunks via connection.send(). 
+ logForDebugging('[doubao-asr] Firing onReady immediately') + callbacks.onReady(connection) + + // Consume the AsyncGenerator in the background + void (async () => { + try { + const audioSource: AsyncIterable = queue + const gen: AsyncGenerator = transcribeRealtime(audioSource, { config }) + + for await (const resp of gen) { + if (finalized && resp.type !== ResponseType.FINAL_RESULT && resp.type !== ResponseType.SESSION_FINISHED) { + continue + } + + switch (resp.type) { + case ResponseType.SESSION_STARTED: + logForDebugging('[doubao-asr] Session started') + break + case ResponseType.VAD_START: + logForDebugging('[doubao-asr] VAD detected speech start') + break + case ResponseType.INTERIM_RESULT: + if (resp.text) { + callbacks.onTranscript(resp.text, false) + } + break + case ResponseType.FINAL_RESULT: + if (resp.text) { + callbacks.onTranscript(resp.text, true) + } + break + case ResponseType.ERROR: + logError(new Error(`[doubao-asr] Error: ${resp.errorMsg}`)) + if (!finalized) { + callbacks.onError(resp.errorMsg || 'Doubao ASR 识别错误') + } + break + case ResponseType.SESSION_FINISHED: + logForDebugging('[doubao-asr] Session finished') + break + default: + break + } + } + + // Generator exhausted naturally + const r = finalizeHandle.resolve + finalizeHandle.resolve = null + if (r) r('post_closestream_endpoint') + } catch (err) { + logError(new Error(`[doubao-asr] Stream error: ${String(err)}`)) + if (!finalized) { + callbacks.onError(`Doubao ASR 连接错误: ${String(err)}`) + } + const r2 = finalizeHandle.resolve + finalizeHandle.resolve = null + if (r2) r2('ws_close') + } + })() + + return connection +} diff --git a/src/utils/settings/types.ts b/src/utils/settings/types.ts index 37c78ce3f..f715c568d 100644 --- a/src/utils/settings/types.ts +++ b/src/utils/settings/types.ts @@ -880,6 +880,10 @@ export const SettingsSchema = lazySchema(() => .boolean() .optional() .describe('Enable voice mode (hold-to-talk dictation)'), + voiceProvider: z + .enum(['anthropic', 'doubao']) 
+ .optional() + .describe('Voice STT backend: "anthropic" (default) or "doubao" (Doubao ASR)'), } : {}), ...(feature('KAIROS') diff --git a/src/utils/suggestions/__tests__/commandSuggestions.test.ts b/src/utils/suggestions/__tests__/commandSuggestions.test.ts index db0da0757..e245dd179 100644 --- a/src/utils/suggestions/__tests__/commandSuggestions.test.ts +++ b/src/utils/suggestions/__tests__/commandSuggestions.test.ts @@ -23,7 +23,7 @@ function makeCommand(name: string, opts?: Partial): Command { type: 'local', handler: () => {}, ...opts, - } as Command + } as unknown as Command } function makePromptCommand( @@ -37,7 +37,7 @@ function makePromptCommand( handler: () => {}, source: 'userSettings', ...opts, - } as Command + } as unknown as Command } // ─── isCommandInput ─────────────────────────────────────────────────── diff --git a/src/voice/voiceModeEnabled.ts b/src/voice/voiceModeEnabled.ts index 1d8867c35..01955070e 100644 --- a/src/voice/voiceModeEnabled.ts +++ b/src/voice/voiceModeEnabled.ts @@ -44,11 +44,18 @@ export function hasVoiceAuth(): boolean { } /** - * Full runtime check: auth + GrowthBook kill-switch. Callers: `/voice` - * (voice.ts, voice/index.ts), ConfigTool, VoiceModeNotice — command-time - * paths where a fresh keychain read is acceptable. For React render - * paths use useVoiceEnabled() instead (memoizes the auth half). + * Full runtime check for Anthropic voice_stream backend. + * Returns true when both auth + GrowthBook kill-switch pass. */ export function isVoiceModeEnabled(): boolean { return hasVoiceAuth() && isVoiceGrowthBookEnabled() } + +/** + * Check if voice mode can be activated with any STT backend. + * Always returns true when VOICE_MODE feature flag is on and GrowthBook + * kill-switch is off — the Doubao backend does not require Anthropic auth. + */ +export function isVoiceAvailable(): boolean { + return isVoiceGrowthBookEnabled() +}