Merge branch 'pr/amDosion/92'

2026-06-15 12:55:51 +00:00 · 2026-04-04 00:12:15 +08:00
parent 52d8b83b24 7ae94327fb
commit 131465097f
11 changed files with 318 additions and 136 deletions
--- a/DEV-LOG.md
+++ b/DEV-LOG.md
@@ -1,5 +1,35 @@
 # DEV-LOG

+## Enable Voice Mode / VOICE_MODE (2026-04-03)
+
+恢复 `/voice` 语音输入功能。`src/` 下所有 voice 相关源码已与官方一致（0 行差异），问题出在：① `VOICE_MODE` 编译开关未开，命令不显示；② `audio-capture-napi` 是 SoX 子进程 stub（Windows 不支持），缺少官方原生 `.node` 二进制。
+
+**新增文件：**
+
+| 文件 | 说明 |
+|------|------|
+| `vendor/audio-capture/{platform}/audio-capture.node` | 6 个平台的原生音频二进制（cpal，来自参考项目） |
+| `vendor/audio-capture-src/index.ts` | 原生模块加载器（按 `${arch}-${platform}` 动态 require `.node`） |
+
+**修改文件：**
+
+| 文件 | 变更 |
+|------|------|
+| `packages/audio-capture-napi/src/index.ts` | SoX 子进程 stub → 原生 `.node` 加载器（含 `process.cwd()` workspace 路径 fallback） |
+| `scripts/dev.ts` | `DEFAULT_FEATURES` 加 `"VOICE_MODE"` |
+| `build.ts` | `DEFAULT_BUILD_FEATURES` 加 `"VOICE_MODE"` |
+| `docs/features/voice-mode.md` | 追加恢复计划章节（第八节） |
+
+**验证结果：**
+
+- `isNativeAudioAvailable()` → `true`（Windows x64 原生 `.node` 加载成功）
+- `feature('VOICE_MODE')` → `ENABLED`
+- `bun run build` → voice 代码编入产物
+
+**运行时前置条件：** claude.ai OAuth 登录 + 麦克风权限
+
+---
+
 ## Enable Claude in Chrome MCP (2026-04-03)

 恢复 Chrome 浏览器控制功能。`src/` 下所有 claudeInChrome 相关源码已与官方一致（0 行差异），问题出在 `@ant/claude-for-chrome-mcp` 包是 6 行 stub（返回空工具列表和 null server）。
--- a/build.ts
+++ b/build.ts
@@ -10,7 +10,7 @@ rmSync(outdir, { recursive: true, force: true });

 // Default features that match the official CLI build.
 // Additional features can be enabled via FEATURE_<NAME>=1 env vars.
-const DEFAULT_BUILD_FEATURES = ["AGENT_TRIGGERS_REMOTE"];
+const DEFAULT_BUILD_FEATURES = ["AGENT_TRIGGERS_REMOTE", "VOICE_MODE"];

 // Collect FEATURE_* env vars → Bun.build features
 const envFeatures = Object.keys(process.env)
--- a/packages/audio-capture-napi/src/index.ts
+++ b/packages/audio-capture-napi/src/index.ts
@@ -1,151 +1,152 @@
-// audio-capture-napi: cross-platform audio capture using SoX (rec) on macOS
-// and arecord (ALSA) on Linux. Replaces the original cpal-based native module.

-import { type ChildProcess, spawn, spawnSync } from 'child_process'
-
-// ─── State ───────────────────────────────────────────────────────────
-
-let recordingProcess: ChildProcess | null = null
-let availabilityCache: boolean | null = null
-
-// ─── Helpers ─────────────────────────────────────────────────────────
-
-function commandExists(cmd: string): boolean {
-  const result = spawnSync(cmd, ['--version'], {
-    stdio: 'ignore',
-    timeout: 3000,
-  })
-  return result.error === undefined
+// Surface of the native audio-capture addon (`audio-capture.node`) as this
+// loader uses it. The require() results in loadModule() are cast to this type
+// without any runtime validation, so it records an expectation rather than a
+// checked contract.
+type AudioCaptureNapi = {
+  // Recording. The exported wrappers pass both callbacks through untouched and
+  // return the addon's boolean result to the caller.
+  startRecording(
+    onData: (data: Buffer) => void,
+    onEnd: () => void,
+  ): boolean
+  stopRecording(): void
+  isRecording(): boolean
+  // Playback counterpart of the recording calls.
+  startPlayback(sampleRate: number, channels: number): boolean
+  writePlaybackData(data: Buffer): void
+  stopPlayback(): void
+  isPlaying(): boolean
+  // Microphone authorization status as a numeric code:
+  // 0 = notDetermined, 1 = restricted, 2 = denied, 3 = authorized.
+  // macOS: the TCC microphone authorization status.
+  // Linux: always returns 3 (authorized) — no system-level microphone permission API.
+  // Windows: returns 3 (authorized) if registry key absent or allowed,
+  //          2 (denied) if microphone access is explicitly denied.
+  // Optional: the exported wrapper returns 0 when the addon does not provide it.
+  microphoneAuthorizationStatus?(): number
 }

-// ─── Public API ──────────────────────────────────────────────────────
+// Outcome of the one-time load attempt: the addon on success, null otherwise.
+let cachedModule: AudioCaptureNapi | null = null
+// Set to true on the first loadModule() call so the probing is never repeated,
+// even when it failed.
+let loadAttempted = false

-/**
- * Check whether a supported audio recording command is available.
- * Returns true if `rec` (SoX) is found on macOS, or `arecord` (ALSA) on Linux.
- * Windows is not supported and always returns false.
- */
-export function isNativeAudioAvailable(): boolean {
-  if (availabilityCache !== null) {
-    return availabilityCache
+// Loads the native audio-capture addon, probing at most once per process.
+// Returns the addon, or null when the platform is unsupported or none of the
+// candidate paths could be required. The outcome (including failure) is cached,
+// so later calls never touch the filesystem again.
+function loadModule(): AudioCaptureNapi | null {
+  if (loadAttempted) {
+    return cachedModule
+  }
+  loadAttempted = true
+
+  // Supported platforms: macOS (darwin), Linux, Windows (win32)
+  const platform = process.platform
+  if (platform !== 'darwin' && platform !== 'linux' && platform !== 'win32') {
+    return null
  }

-  if (process.platform === 'win32') {
-    availabilityCache = false
-    return false
-  }
-
-  if (process.platform === 'darwin') {
-    // macOS: use SoX rec
-    availabilityCache = commandExists('rec')
-    return availabilityCache
-  }
-
-  if (process.platform === 'linux') {
-    // Linux: prefer arecord, fall back to rec
-    availabilityCache = commandExists('arecord') || commandExists('rec')
-    return availabilityCache
-  }
-
-  availabilityCache = false
-  return false
-}
-
-/**
- * Check whether a recording is currently in progress.
- */
-export function isNativeRecordingActive(): boolean {
-  return recordingProcess !== null && !recordingProcess.killed
-}
-
-/**
- * Stop the active recording process, if any.
- */
-export function stopNativeRecording(): void {
-  if (recordingProcess) {
-    const proc = recordingProcess
-    recordingProcess = null
-    if (!proc.killed) {
-      proc.kill('SIGTERM')
+  // Candidate 1: native-embed path (bun compile). AUDIO_CAPTURE_NODE_PATH is
+  // defined at build time in build-with-plugins.ts for native builds only — the
+  // define resolves it to the static literal "../../audio-capture.node" so bun
+  // compile can rewrite it to /$bunfs/root/audio-capture.node. MUST stay a
+  // direct require(env var) — bun cannot analyze require(variable) from a loop.
+  if (process.env.AUDIO_CAPTURE_NODE_PATH) {
+    try {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      cachedModule = require(
+        process.env.AUDIO_CAPTURE_NODE_PATH,
+      ) as AudioCaptureNapi
+      return cachedModule
+    } catch {
+      // fall through to runtime fallbacks below
    }
  }
+
+  // Candidates 2-5: npm-install, dev/source, and workspace layouts.
+  // In bundled output, require() resolves relative to cli.js at the package root.
+  // In dev, it resolves relative to this file. When loaded from a workspace
+  // package (packages/audio-capture-napi/src/), the repo-level vendor/ directory
+  // sits three levels above this file, so that module-relative candidate is
+  // tried before the cwd-based one and does not depend on where the process
+  // was launched from.
+  const platformDir = `${process.arch}-${platform}`
+  const fallbacks = [
+    `./vendor/audio-capture/${platformDir}/audio-capture.node`,
+    `../audio-capture/${platformDir}/audio-capture.node`,
+    `../../../vendor/audio-capture/${platformDir}/audio-capture.node`,
+    // NOTE(review): kept only as a last resort for backward compatibility.
+    // Requiring a native binary from process.cwd() executes whatever
+    // vendor/audio-capture/.../audio-capture.node exists in the directory the
+    // user launched from — consider removing once the candidate above is
+    // confirmed to cover the workspace layout.
+    `${process.cwd()}/vendor/audio-capture/${platformDir}/audio-capture.node`,
+  ]
+  for (const p of fallbacks) {
+    try {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      cachedModule = require(p) as AudioCaptureNapi
+      return cachedModule
+    } catch {
+      // this layout is not present (or failed to load); try the next one
+    }
+  }
+  return null
+}
+
+// Reports whether the native audio-capture addon could be loaded. The first
+// call triggers the load attempt; later calls reuse its cached outcome.
+export function isNativeAudioAvailable(): boolean {
+  const addon = loadModule()
+  return addon !== null
 }

-/**
- * Start recording audio. Raw PCM data (16kHz, 16-bit signed, mono) is
- * streamed via the onData callback. onEnd is called when recording stops
- * (either from silence detection or process termination).
- *
- * Returns true if recording started successfully, false otherwise.
- */
+// Starts a recording through the native addon, handing `onData` and `onEnd`
+// straight to it. Returns the addon's own boolean result, or false when the
+// addon could not be loaded.
 export function startNativeRecording(
  onData: (data: Buffer) => void,
  onEnd: () => void,
 ): boolean {
-  // Don't start if already recording
-  if (isNativeRecordingActive()) {
-    stopNativeRecording()
-  }
-
-  if (!isNativeAudioAvailable()) {
+  const mod = loadModule()
+  if (!mod) {
    return false
  }
-
-  let child: ChildProcess
-
-  if (process.platform === 'darwin' || (process.platform === 'linux' && commandExists('rec'))) {
-    // Use SoX rec: output raw PCM 16kHz 16-bit signed mono to stdout
-    child = spawn(
-      'rec',
-      [
-        '-q',           // quiet
-        '--buffer',
-        '1024',         // small buffer for low latency
-        '-t', 'raw',    // raw PCM output
-        '-r', '16000',  // 16kHz sample rate
-        '-e', 'signed', // signed integer encoding
-        '-b', '16',     // 16-bit
-        '-c', '1',      // mono
-        '-',            // output to stdout
-      ],
-      { stdio: ['pipe', 'pipe', 'pipe'] },
-    )
-  } else if (process.platform === 'linux' && commandExists('arecord')) {
-    // Use arecord: output raw PCM 16kHz 16-bit signed LE mono to stdout
-    child = spawn(
-      'arecord',
-      [
-        '-f', 'S16_LE', // signed 16-bit little-endian
-        '-r', '16000',  // 16kHz sample rate
-        '-c', '1',      // mono
-        '-t', 'raw',    // raw PCM, no header
-        '-q',           // quiet
-        '-',            // output to stdout
-      ],
-      { stdio: ['pipe', 'pipe', 'pipe'] },
-    )
-  } else {
-    return false
-  }
-
-  recordingProcess = child
-
-  child.stdout?.on('data', (chunk: Buffer) => {
-    onData(chunk)
-  })
-
-  // Consume stderr to prevent backpressure
-  child.stderr?.on('data', () => {})
-
-  child.on('close', () => {
-    recordingProcess = null
-    onEnd()
-  })
-
-  child.on('error', () => {
-    recordingProcess = null
-    onEnd()
-  })
-
-  return true
+  // Delegate entirely to the addon; no state is tracked on the JS side.
+  return mod.startRecording(onData, onEnd)
+}
+
+// Forwards to the addon's stopRecording(); does nothing when the addon could
+// not be loaded.
+export function stopNativeRecording(): void {
+  const addon = loadModule()
+  if (addon) {
+    addon.stopRecording()
+  }
+}
+
+// Returns the addon's isRecording() answer, or false when the addon could not
+// be loaded.
+export function isNativeRecordingActive(): boolean {
+  const addon = loadModule()
+  return addon ? addon.isRecording() : false
+}
+
+// Asks the addon to start playback with the given sample rate and channel
+// count. Returns the addon's own result, or false when the addon could not be
+// loaded.
+export function startNativePlayback(
+  sampleRate: number,
+  channels: number,
+): boolean {
+  const addon = loadModule()
+  return addon ? addon.startPlayback(sampleRate, channels) : false
+}
+
+// Hands one buffer to the addon's writePlaybackData(); the buffer is silently
+// dropped when the addon could not be loaded.
+export function writeNativePlaybackData(data: Buffer): void {
+  const addon = loadModule()
+  if (addon) {
+    addon.writePlaybackData(data)
+  }
+}
+
+// Forwards to the addon's stopPlayback(); does nothing when the addon could
+// not be loaded.
+export function stopNativePlayback(): void {
+  const addon = loadModule()
+  if (addon) {
+    addon.stopPlayback()
+  }
+}
+
+// Returns the addon's isPlaying() answer, or false when the addon could not be
+// loaded.
+export function isNativePlaying(): boolean {
+  const addon = loadModule()
+  return addon ? addon.isPlaying() : false
+}
+
+// Microphone authorization status as reported by the native addon.
+// Codes: 0=notDetermined, 1=restricted, 2=denied, 3=authorized.
+// macOS: the TCC status.
+// Linux: always 3 (authorized) — no system-level mic permission API.
+// Windows: 3 (authorized) if registry key absent or allowed, 2 (denied) if explicitly denied.
+// Falls back to 0 (notDetermined) when the addon is unavailable or does not
+// expose this optional function.
+export function microphoneAuthorizationStatus(): number {
+  const addon = loadModule()
+  if (addon && addon.microphoneAuthorizationStatus) {
+    return addon.microphoneAuthorizationStatus()
+  }
+  return 0
 }
--- a/scripts/dev.ts
+++ b/scripts/dev.ts
@@ -15,7 +15,7 @@ const defineArgs = Object.entries(defines).flatMap(([k, v]) => [

 // Bun --feature flags: enable feature() gates at runtime.
 // Default features enabled in dev mode.
-const DEFAULT_FEATURES = ["BUDDY", "TRANSCRIPT_CLASSIFIER", "BRIDGE_MODE", "AGENT_TRIGGERS_REMOTE"];
+const DEFAULT_FEATURES = ["BUDDY", "TRANSCRIPT_CLASSIFIER", "BRIDGE_MODE", "AGENT_TRIGGERS_REMOTE", "VOICE_MODE"];

 // Any env var matching FEATURE_<NAME>=1 will also enable that feature.
 // e.g. FEATURE_PROACTIVE=1 bun run dev
--- a/vendor/audio-capture-src/index.ts
+++ b/vendor/audio-capture-src/index.ts
@@ -0,0 +1,151 @@
+
+// Surface of the native audio-capture addon (`audio-capture.node`) as this
+// loader uses it. The require() results in loadModule() are cast to this type
+// without any runtime validation, so it records an expectation rather than a
+// checked contract.
+type AudioCaptureNapi = {
+  // Recording. The exported wrappers pass both callbacks through untouched and
+  // return the addon's boolean result to the caller.
+  startRecording(
+    onData: (data: Buffer) => void,
+    onEnd: () => void,
+  ): boolean
+  stopRecording(): void
+  isRecording(): boolean
+  // Playback counterpart of the recording calls.
+  startPlayback(sampleRate: number, channels: number): boolean
+  writePlaybackData(data: Buffer): void
+  stopPlayback(): void
+  isPlaying(): boolean
+  // Microphone authorization status as a numeric code:
+  // 0 = notDetermined, 1 = restricted, 2 = denied, 3 = authorized.
+  // macOS: the TCC microphone authorization status.
+  // Linux: always returns 3 (authorized) — no system-level microphone permission API.
+  // Windows: returns 3 (authorized) if registry key absent or allowed,
+  //          2 (denied) if microphone access is explicitly denied.
+  // Optional: the exported wrapper returns 0 when the addon does not provide it.
+  microphoneAuthorizationStatus?(): number
+}
+
+// Outcome of the one-time load attempt: the addon on success, null otherwise.
+let cachedModule: AudioCaptureNapi | null = null
+// Set to true on the first loadModule() call so the probing is never repeated,
+// even when it failed.
+let loadAttempted = false
+
+// Resolves the native addon on first use and remembers the outcome — a failed
+// probe is cached as null and never retried within the process.
+function loadModule(): AudioCaptureNapi | null {
+  if (loadAttempted) {
+    return cachedModule
+  }
+  loadAttempted = true
+
+  // Only macOS (darwin), Linux and Windows (win32) are probed.
+  const platform = process.platform
+  const isSupported =
+    platform === 'darwin' || platform === 'linux' || platform === 'win32'
+  if (!isSupported) {
+    return null
+  }
+
+  // Candidate 1: native-embed path (bun compile). AUDIO_CAPTURE_NODE_PATH is
+  // defined at build time in build-with-plugins.ts for native builds only — the
+  // define resolves it to the static literal "../../audio-capture.node" so bun
+  // compile can rewrite it to /$bunfs/root/audio-capture.node. MUST stay a
+  // direct require(env var) — bun cannot analyze require(variable) from a loop.
+  if (process.env.AUDIO_CAPTURE_NODE_PATH) {
+    try {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      cachedModule = require(
+        process.env.AUDIO_CAPTURE_NODE_PATH,
+      ) as AudioCaptureNapi
+      return cachedModule
+    } catch {
+      // embedded path did not load; continue with the runtime candidates
+    }
+  }
+
+  // Candidates 2/3: npm-install and dev/source layouts. Dynamic require is
+  // fine here — in bundled output (node --target build) require() resolves at
+  // runtime relative to cli.js at the package root; in dev it resolves
+  // relative to this file (vendor/audio-capture-src/index.ts).
+  const archAndOs = `${process.arch}-${platform}`
+  const candidates = [
+    `./vendor/audio-capture/${archAndOs}/audio-capture.node`,
+    `../audio-capture/${archAndOs}/audio-capture.node`,
+  ]
+  for (const candidate of candidates) {
+    let addon: AudioCaptureNapi
+    try {
+      // eslint-disable-next-line @typescript-eslint/no-require-imports
+      addon = require(candidate) as AudioCaptureNapi
+    } catch {
+      // this layout is not present; move on to the next one
+      continue
+    }
+    cachedModule = addon
+    return addon
+  }
+  return null
+}
+
+// Reports whether the native audio-capture addon could be loaded. The first
+// call triggers the load attempt; later calls reuse its cached outcome.
+export function isNativeAudioAvailable(): boolean {
+  const addon = loadModule()
+  return addon !== null
+}
+
+// Starts a recording through the native addon, handing `onData` and `onEnd`
+// straight to it. Returns the addon's own boolean result, or false when the
+// addon could not be loaded.
+export function startNativeRecording(
+  onData: (data: Buffer) => void,
+  onEnd: () => void,
+): boolean {
+  const addon = loadModule()
+  return addon ? addon.startRecording(onData, onEnd) : false
+}
+
+// Forwards to the addon's stopRecording(); does nothing when the addon could
+// not be loaded.
+export function stopNativeRecording(): void {
+  const addon = loadModule()
+  if (addon) {
+    addon.stopRecording()
+  }
+}
+
+// Returns the addon's isRecording() answer, or false when the addon could not
+// be loaded.
+export function isNativeRecordingActive(): boolean {
+  const addon = loadModule()
+  return addon ? addon.isRecording() : false
+}
+
+// Asks the addon to start playback with the given sample rate and channel
+// count. Returns the addon's own result, or false when the addon could not be
+// loaded.
+export function startNativePlayback(
+  sampleRate: number,
+  channels: number,
+): boolean {
+  const addon = loadModule()
+  return addon ? addon.startPlayback(sampleRate, channels) : false
+}
+
+// Hands one buffer to the addon's writePlaybackData(); the buffer is silently
+// dropped when the addon could not be loaded.
+export function writeNativePlaybackData(data: Buffer): void {
+  const addon = loadModule()
+  if (addon) {
+    addon.writePlaybackData(data)
+  }
+}
+
+// Forwards to the addon's stopPlayback(); does nothing when the addon could
+// not be loaded.
+export function stopNativePlayback(): void {
+  const addon = loadModule()
+  if (addon) {
+    addon.stopPlayback()
+  }
+}
+
+// Returns the addon's isPlaying() answer, or false when the addon could not be
+// loaded.
+export function isNativePlaying(): boolean {
+  const addon = loadModule()
+  return addon ? addon.isPlaying() : false
+}
+
+// Microphone authorization status as reported by the native addon.
+// Codes: 0=notDetermined, 1=restricted, 2=denied, 3=authorized.
+// macOS: the TCC status.
+// Linux: always 3 (authorized) — no system-level mic permission API.
+// Windows: 3 (authorized) if registry key absent or allowed, 2 (denied) if explicitly denied.
+// Falls back to 0 (notDetermined) when the addon is unavailable or does not
+// expose this optional function.
+export function microphoneAuthorizationStatus(): number {
+  const addon = loadModule()
+  if (addon && addon.microphoneAuthorizationStatus) {
+    return addon.microphoneAuthorizationStatus()
+  }
+  return 0
+}
--- a/vendor/audio-capture/arm64-darwin/audio-capture.node
+++ b/vendor/audio-capture/arm64-darwin/audio-capture.node
--- a/vendor/audio-capture/arm64-linux/audio-capture.node
+++ b/vendor/audio-capture/arm64-linux/audio-capture.node
--- a/vendor/audio-capture/arm64-win32/audio-capture.node
+++ b/vendor/audio-capture/arm64-win32/audio-capture.node
--- a/vendor/audio-capture/x64-darwin/audio-capture.node
+++ b/vendor/audio-capture/x64-darwin/audio-capture.node
--- a/vendor/audio-capture/x64-linux/audio-capture.node
+++ b/vendor/audio-capture/x64-linux/audio-capture.node
--- a/vendor/audio-capture/x64-win32/audio-capture.node
+++ b/vendor/audio-capture/x64-win32/audio-capture.node