Files
claude-code/src/hooks/useVoiceIntegration.tsx
Bonerush 8ba51edec1 fix: 修复条件式 hook 调用导致的 "Rendered fewer hooks than expected" 错误
修复在 dev 模式下按下 Ctrl+O 切换 transcript 视图时 React 抛出
"Rendered fewer hooks than expected" 崩溃的问题。

## 根因分析

项目中有大量 hook(useState / useMemo / useRef / useSyncExternalStore 等)
被包裹在 `feature()` 三元表达式中条件调用,例如:

    const value = feature('X') ? useHook() : defaultValue;

在 build 模式下 `feature()` 是编译时常量,死代码消除会移除未使用的分支,
hooks 数量在编译后是确定的。但在 dev 模式下(scripts/dev.ts 注入
--feature 启用全部 31 个 feature),`feature()` 是运行时调用,
但始终返回 true,因此所有 hooks 都会被调用,原本不会出问题。

真正的触发器是 REPL.tsx 第 5381 行的提前返回:

    if (screen === 'transcript') { return transcriptReturn; }

当用户按下 Ctrl+O 进入 transcript 模式时,该提前返回之后的所有 hooks
(如 displayedAgentMessages 的 useMemo)都不会被调用,导致 React 在
下一次渲染时检测到 hooks 数量与上次不一致而崩溃。

此外,其他文件中也存在相同的条件式 hook 模式——虽然 dev 模式下
feature() 返回 true,所以这些路径实际上不会被触发,但它们是
潜在的隐患:若将来有人通过环境变量关闭某个 feature,
同样的崩溃会立即出现。

## 修复策略

采用统一模式:**始终无条件调用 hook,将 feature() gate 应用到值上**。

    // Before (unsafe — hook count varies by feature flag)
    const value = feature('X') ? useHook() : defaultValue;

    // After (safe — hook always called, gate on the value)
    const rawValue = useHook();
    const value = feature('X') ? rawValue : defaultValue;

## 修改清单

### 核心修复(REPL.tsx)
- 将 `displayedAgentMessages` useMemo 及依赖变量(viewedTask /
  viewedTeammateTask / viewedAgentTask / usesSyncMessages /
  rawAgentMessages / displayedMessages)从 transcript 提前返回
  之后移至之前,确保两模式下 hooks 调用顺序一致
- 修复 `disableMessageActions` / `useAssistantHistory` /
  `voiceIntegration` 的条件式 hook 调用

### 条件式 hook 修复(12 个文件)
- src/hooks/useGlobalKeybindings.tsx — isBriefOnly / toggleBrief
  keybinding 改为 isActive 门控
- src/hooks/useReplBridge.tsx — 5 个 BRIDGE_MODE 选值改为无条件调用
- src/hooks/useVoiceIntegration.tsx — 4 个 VOICE_MODE 选值修复
- src/components/PromptInput/Notifications.tsx — 4 个 feature 选值修复
- src/components/PromptInput/PromptInput.tsx — briefOwnsGap /
  companionSpeaking 修复
- src/components/PromptInput/PromptInputFooterLeftSide.tsx — 4 个
  VOICE_MODE 选值修复
- src/components/PromptInput/PromptInputQueuedCommands.tsx — isBriefOnly
- src/components/Spinner.tsx — briefEnvEnabled 修复
- src/components/TextInput.tsx — voiceState / audioLevels /
  animationFrame 修复
- src/components/messages/AttachmentMessage.tsx — isDemoEnv 修复
- src/components/messages/UserPromptMessage.tsx — isBriefOnly /
  viewingAgentTaskId / briefEnvEnabled 修复
- src/components/messages/UserToolResultMessage/UserToolSuccessMessage.tsx
  — isBriefOnly 修复

### 其他修复
- src/components/FeedbackSurvey/useFrustrationDetection.ts — 将 3 个
  提前返回合并为 shouldSkip 变量,handleTranscriptSelect 提前 return
- src/hooks/useIssueFlagBanner.ts — useRef 移到 USER_TYPE 检查之前
- src/hooks/useUpdateNotification.ts — useState 改为 useRef,
  避免版本号变化触发不必要重渲染

### 构建/开发配置
- build.ts — 添加 `sourcemap: 'linked'`
- scripts/dev.ts — NODE_ENV 从 'production' 改为 'development'

Closes #434
2026-05-08 13:17:25 +08:00

680 lines
31 KiB
TypeScript

import { feature } from 'bun:bundle';
import * as React from 'react';
import { useCallback, useEffect, useMemo, useRef } from 'react';
import { useNotifications } from '../context/notifications.js';
import { useIsModalOverlayActive } from '../context/overlayContext.js';
import { useGetVoiceState, useSetVoiceState, useVoiceState } from '../context/voice.js';
import { KeyboardEvent, useInput } from '@anthropic/ink';
// backward-compat bridge until REPL wires handleKeyDown to <Box onKeyDown>
import { useOptionalKeybindingContext } from '../keybindings/KeybindingContext.js';
import { keystrokesEqual } from '../keybindings/resolver.js';
import type { ParsedKeystroke } from '../keybindings/types.js';
import { normalizeFullWidthSpace } from '../utils/stringUtils.js';
import { useVoiceEnabled } from './useVoiceEnabled.js';
// Dead code elimination: conditional import for voice input hook.
// The real ./useVoice.js module is only required when the VOICE_MODE feature
// is on; otherwise a local stub with the same call shape is used.
/* eslint-disable @typescript-eslint/no-require-imports */
// Capture the module namespace, not the function: spyOn() mutates the module
// object, so `voiceNs.useVoice(...)` resolves to the spy even if this module
// was loaded before the spy was installed (test ordering independence).
const voiceNs: { useVoice: typeof import('./useVoice.js').useVoice } = feature('VOICE_MODE')
? require('./useVoice.js')
: {
// Stub used when VOICE_MODE is off: always reports state 'idle' and its
// handleKeyEvent is a no-op, so callers can invoke it unconditionally.
useVoice: ({ enabled: _e }: { onTranscript: (t: string) => void; enabled: boolean }) => ({
state: 'idle' as const,
handleKeyEvent: (_fallbackMs?: number) => {},
}),
};
/* eslint-enable @typescript-eslint/no-require-imports */
// Maximum gap (ms) between key presses to count as held (auto-repeat).
// Terminal auto-repeat fires every 30-80ms; 120ms covers jitter while
// excluding normal typing speed (100-300ms between keystrokes).
// Used as the delay of the reset timer in useVoiceKeybindingHandler: when no
// further matching press arrives within this window, the rapid-press counters
// are zeroed and the warmup flag is cleared.
const RAPID_KEY_GAP_MS = 120;
// Fallback (ms) for modifier-combo first-press activation. Must match
// FIRST_PRESS_FALLBACK_MS in useVoice.ts. Covers the max OS initial
// key-repeat delay (~2s on macOS with slider at "Long") so holding a
// modifier combo doesn't fragment into two sessions when the first
// auto-repeat arrives after the default 600ms REPEAT_FALLBACK_MS.
// Passed to voiceHandleKeyEvent() on the activation press of a modifier-combo
// binding only; bare-char activation calls it without an argument.
const MODIFIER_FIRST_PRESS_FALLBACK_MS = 2000;
// Number of rapid consecutive key events required to activate voice.
// Only applies to bare-char bindings (space, v, etc.) where a single press
// could be normal typing. Modifier combos activate on the first press.
// Compared against a running count that advances by each event's repeat
// count, so one batched event can contribute more than one.
const HOLD_THRESHOLD = 5;
// Number of rapid key events to start showing warmup feedback.
// Events that arrive while the running count is still below this value are
// allowed to flow into the text input; later rapid events are swallowed.
const WARMUP_THRESHOLD = 2;
// Decide whether a KeyboardEvent corresponds to a ParsedKeystroke. This
// supersedes the legacy matchesKeystroke(input, Key, ...) path, which was
// written against useInput's raw `input` argument. KeyboardEvent.key carries
// normalized names (e.g. 'space', 'f9') that getKeyName() did not understand,
// so modifier combos and f-keys silently stopped matching after the
// onKeyDown migration (#23524).
function matchesKeyboardEvent(e: KeyboardEvent, target: ParsedKeystroke): boolean {
  // Translate the event's key name into ParsedKeystroke vocabulary: the
  // parser stores ' ' for space and 'enter' for return (see parser.ts
  // case 'space'/'return'); everything else is compared lower-cased.
  let eventKey: string;
  switch (e.key) {
    case 'space':
      eventKey = ' ';
      break;
    case 'return':
      eventKey = 'enter';
      break;
    default:
      eventKey = e.key.toLowerCase();
  }
  // KeyboardEvent.meta folds alt|option together (terminal limitation —
  // esc-prefix), while ParsedKeystroke exposes alt and meta as aliases of
  // the same modifier, so either one on the target satisfies e.meta.
  return (
    eventKey === target.key &&
    e.ctrl === target.ctrl &&
    e.shift === target.shift &&
    e.meta === (target.alt || target.meta) &&
    e.superKey === target.super
  );
}
// Hardcoded default for when there's no KeybindingProvider at all (e.g.
// headless/test contexts). NOT used when the provider exists and the
// lookup returns null — that means the user null-unbound or reassigned
// space, and falling back to space would pick a dead or conflicting key.
// Describes a bare space press with every modifier off; it is what
// useVoiceKeybindingHandler returns as the voice keystroke when
// useOptionalKeybindingContext() yields no context.
const DEFAULT_VOICE_KEYSTROKE: ParsedKeystroke = {
key: ' ',
ctrl: false,
alt: false,
shift: false,
meta: false,
super: false,
};
// Handle onto the mounted text input. In this file it is reached through
// insertTextRef and may be null, in which case writes fall back to the raw
// input setter.
type InsertTextHandle = {
// Not called anywhere in this file; presumably inserts text at the cursor —
// confirm against the text input implementation.
insert: (text: string) => void;
// Replace the whole input value and place the cursor at `cursor`.
setInputWithCursor: (value: string, cursor: number) => void;
// Current cursor index into the input value; used to split the input into
// the text before and after the cursor.
cursorOffset: number;
};
// Arguments of useVoiceIntegration.
type UseVoiceIntegrationArgs = {
// Raw setter for the input value; used only when no InsertTextHandle is
// available (the cursor position cannot be applied on that path).
setInputValueRaw: React.Dispatch<React.SetStateAction<string>>;
// Ref holding the current input value, read synchronously by the hook.
inputValueRef: React.RefObject<string>;
// Ref to the text input handle (null when not available).
insertTextRef: React.RefObject<InsertTextHandle | null>;
};
// Character range of the interim transcript inside the input value.
// `end` is computed as `start` + the interim transcript length.
type InterimRange = { start: number; end: number };
// Options accepted by stripTrailing.
type StripOpts = {
// Which char to strip (the configured hold key). Defaults to space.
char?: string;
// Capture the voice prefix/suffix anchor at the stripped position.
anchor?: boolean;
// Minimum trailing count to leave behind — prevents stripping the
// intentional warmup chars when defensively cleaning up leaks.
floor?: number;
};
// Value returned by useVoiceIntegration.
type UseVoiceIntegrationResult = {
// Returns the number of trailing chars remaining after stripping.
stripTrailing: (maxStrip: number, opts?: StripOpts) => number;
// Undo the gap space and reset anchor refs after a failed voice activation.
resetAnchor: () => void;
// The voice hook's key-event handler, forwarded unchanged. The optional
// argument is a fallback duration in ms.
handleKeyEvent: (fallbackMs?: number) => void;
// Range of interim transcript text in the input, or null when there is no
// voice anchor or the interim transcript is empty.
interimRange: InterimRange | null;
};
// Write `value` into the prompt input. Prefers the text-input handle, which
// also positions the cursor at `cursor`; when no handle is available the raw
// setter is used and the cursor position is not applied.
function writeVoiceInput(
  handle: { setInputWithCursor: (value: string, cursor: number) => void } | null,
  setRaw: (value: string) => void,
  value: string,
  cursor: number,
): void {
  if (handle) {
    handle.setInputWithCursor(value, cursor);
  } else {
    setRaw(value);
  }
}

// True when a separating space is required between a non-empty `prefix`
// that does not already end in whitespace and a non-empty `text`.
function needsSpaceAfterPrefix(prefix: string, text: string): boolean {
  return prefix.length > 0 && !/\s$/.test(prefix) && text.length > 0;
}

// True when `suffix` is non-empty and does not already start with
// whitespace, i.e. a gap space must precede it.
function needsSpaceBeforeSuffix(suffix: string): boolean {
  return suffix.length > 0 && !/^\s/.test(suffix);
}

/**
 * Connects the voice hook to the prompt input.
 *
 * Keeps a "voice anchor" (the input text before and after the cursor at the
 * moment voice starts), rewrites the input live as interim transcripts
 * arrive, and commits final transcripts between the anchored prefix and
 * suffix. All hooks are called unconditionally; the VOICE_MODE feature gate
 * is applied to the values they return.
 *
 * @param setInputValueRaw - Raw input setter, used when no text-input handle exists.
 * @param inputValueRef - Ref holding the current input value.
 * @param insertTextRef - Ref to the text-input handle (may hold null).
 * @returns `stripTrailing` and `resetAnchor` helpers for the key handler, the
 *   voice hook's `handleKeyEvent`, and the `interimRange` to dim in the UI.
 */
export function useVoiceIntegration({
  setInputValueRaw,
  inputValueRef,
  insertTextRef,
}: UseVoiceIntegrationArgs): UseVoiceIntegrationResult {
  const { addNotification } = useNotifications();
  // Tracks the input content before/after the cursor when voice starts,
  // so interim transcripts can be inserted at the cursor position without
  // clobbering surrounding user text.
  const voicePrefixRef = useRef<string | null>(null);
  const voiceSuffixRef = useRef<string>('');
  // Tracks the last input value this hook wrote (via anchor, interim effect,
  // or handleVoiceTranscript). If inputValueRef.current diverges, the user
  // submitted or edited — both write paths bail to avoid clobbering. This is
  // the only guard that correctly handles empty-prefix-empty-suffix: a
  // startsWith('')/endsWith('') check vacuously passes, and a length check
  // can't distinguish a cleared input from a never-set one.
  const lastSetInputRef = useRef<string | null>(null);
  // Strip trailing hold-key chars (and optionally capture the voice
  // anchor). Called during warmup (to clean up chars that leaked past
  // stopImmediatePropagation — listener order is not guaranteed) and
  // on activation (with anchor=true to capture the prefix/suffix around
  // the cursor for interim transcript placement). The caller passes the
  // exact count it expects to strip so pre-existing chars at the
  // boundary are preserved (e.g. the "v" in "hav" when hold-key is "v").
  // The floor option sets a minimum trailing count to leave behind
  // (during warmup this is the count we intentionally let through, so
  // defensive cleanup only removes leaks). Returns the number of
  // trailing chars remaining after stripping. When nothing changes, no
  // state update is performed.
  const stripTrailing = useCallback(
    (maxStrip: number, { char = ' ', anchor = false, floor = 0 }: StripOpts = {}) => {
      const prev = inputValueRef.current;
      const offset = insertTextRef.current?.cursorOffset ?? prev.length;
      const beforeCursor = prev.slice(0, offset);
      const afterCursor = prev.slice(offset);
      // When the hold key is space, also count full-width spaces (U+3000)
      // that a CJK IME may have inserted for the same physical key.
      // U+3000 is BMP single-code-unit so indices align with beforeCursor.
      const scan = char === ' ' ? normalizeFullWidthSpace(beforeCursor) : beforeCursor;
      let trailing = 0;
      while (trailing < scan.length && scan[scan.length - 1 - trailing] === char) {
        trailing++;
      }
      const stripCount = Math.max(0, Math.min(trailing - floor, maxStrip));
      const remaining = trailing - stripCount;
      const stripped = beforeCursor.slice(0, beforeCursor.length - stripCount);
      // When anchoring with a non-space suffix, insert a gap space so the
      // waveform cursor sits on the gap instead of covering the first
      // suffix letter. The interim transcript effect maintains this same
      // structure (prefix + leading + interim + trailing + suffix), so
      // the gap is seamless once transcript text arrives.
      // Always overwrite on anchor — if a prior activation failed to start
      // voice (voiceState stayed 'idle'), the cleanup effect didn't fire and
      // the old anchor is stale. anchor=true is only passed on the single
      // activation call, never during recording, so overwrite is safe.
      let gap = '';
      if (anchor) {
        voicePrefixRef.current = stripped;
        voiceSuffixRef.current = afterCursor;
        if (needsSpaceBeforeSuffix(afterCursor)) {
          gap = ' ';
        }
      }
      const newValue = stripped + gap + afterCursor;
      if (anchor) lastSetInputRef.current = newValue;
      if (newValue === prev && stripCount === 0) return remaining;
      writeVoiceInput(insertTextRef.current, setInputValueRaw, newValue, stripped.length);
      return remaining;
    },
    [setInputValueRaw, inputValueRef, insertTextRef],
  );
  // Undo the gap space inserted by stripTrailing(..., {anchor:true}) and
  // reset the voice prefix/suffix refs. Called when voice activation fails
  // (voiceState stays 'idle' after voiceHandleKeyEvent), so the cleanup
  // effect (voiceState useEffect below) — which only fires on voiceState
  // transitions — can't reach the stale anchor. Without this, the gap space
  // and stale refs persist in the input.
  const resetAnchor = useCallback(() => {
    const prefix = voicePrefixRef.current;
    if (prefix === null) return;
    const suffix = voiceSuffixRef.current;
    voicePrefixRef.current = null;
    voiceSuffixRef.current = '';
    const restored = prefix + suffix;
    writeVoiceInput(insertTextRef.current, setInputValueRaw, restored, prefix.length);
  }, [setInputValueRaw, insertTextRef]);
  // Voice state selectors. useVoiceEnabled = user intent (settings) +
  // auth + GB kill-switch, with the auth half memoized on authVersion so
  // render loops never hit a cold keychain spawn.
  // Each hook is called unconditionally and the feature gate is applied to
  // its result, so the number of hooks per render never depends on the flag.
  const voiceEnabledRaw = useVoiceEnabled();
  const voiceEnabled = feature('VOICE_MODE') ? voiceEnabledRaw : false;
  const voiceStateRaw = useVoiceState(s => s.voiceState);
  const voiceState = feature('VOICE_MODE') ? voiceStateRaw : ('idle' as const);
  const voiceInterimTranscriptRaw = useVoiceState(s => s.voiceInterimTranscript);
  const voiceInterimTranscript = feature('VOICE_MODE') ? voiceInterimTranscriptRaw : '';
  // Set the voice anchor for focus mode (where recording starts via terminal
  // focus, not key hold). Key-hold sets the anchor in stripTrailing.
  useEffect(() => {
    if (!feature('VOICE_MODE')) return;
    if (voiceState === 'recording' && voicePrefixRef.current === null) {
      const input = inputValueRef.current;
      const offset = insertTextRef.current?.cursorOffset ?? input.length;
      voicePrefixRef.current = input.slice(0, offset);
      voiceSuffixRef.current = input.slice(offset);
      lastSetInputRef.current = input;
    }
    if (voiceState === 'idle') {
      voicePrefixRef.current = null;
      voiceSuffixRef.current = '';
      lastSetInputRef.current = null;
    }
  }, [voiceState, inputValueRef, insertTextRef]);
  // Live-update the prompt input with the interim transcript as voice
  // transcribes speech. The prefix (user-typed text before the cursor) is
  // preserved and the transcript is inserted between prefix and suffix.
  useEffect(() => {
    if (!feature('VOICE_MODE')) return;
    if (voicePrefixRef.current === null) return;
    const prefix = voicePrefixRef.current;
    const suffix = voiceSuffixRef.current;
    // Submit race: if the input isn't what this hook last set it to, the
    // user submitted (clearing it) or edited it. voicePrefixRef is only
    // cleared on voiceState→idle, so it's still set during the 'processing'
    // window between CloseStream and WS close — this catches refined
    // TranscriptText arriving then and re-filling a cleared input.
    if (inputValueRef.current !== lastSetInputRef.current) return;
    const leadingSpace = needsSpaceAfterPrefix(prefix, voiceInterimTranscript) ? ' ' : '';
    // Don't gate on voiceInterimTranscript.length -- when interim clears to ''
    // after handleVoiceTranscript sets the final text, the trailing space
    // between prefix and suffix must still be preserved.
    const trailingSpace = needsSpaceBeforeSuffix(suffix) ? ' ' : '';
    const newValue = prefix + leadingSpace + voiceInterimTranscript + trailingSpace + suffix;
    // Position cursor after the transcribed text (before suffix)
    const cursorPos = prefix.length + leadingSpace.length + voiceInterimTranscript.length;
    writeVoiceInput(insertTextRef.current, setInputValueRaw, newValue, cursorPos);
    lastSetInputRef.current = newValue;
  }, [voiceInterimTranscript, setInputValueRaw, inputValueRef, insertTextRef]);
  // Commit a final transcript chunk between the anchored prefix and suffix.
  const handleVoiceTranscript = useCallback(
    (text: string) => {
      if (!feature('VOICE_MODE')) return;
      const prefix = voicePrefixRef.current;
      // No voice anchor — voice was reset (or never started). Nothing to do.
      if (prefix === null) return;
      const suffix = voiceSuffixRef.current;
      // Submit race: finishRecording() → user presses Enter (input cleared)
      // → WebSocket close → this callback fires with stale prefix/suffix.
      // If the input isn't what this hook last set (via the interim effect
      // or anchor), the user submitted or edited — don't re-fill. Comparing
      // against `text.length` would false-positive when the final is longer
      // than the interim (ASR routinely adds punctuation/corrections).
      if (inputValueRef.current !== lastSetInputRef.current) return;
      const leadingSpace = needsSpaceAfterPrefix(prefix, text) ? ' ' : '';
      // Unlike the interim effect, the trailing gap here is only kept when
      // the final text is non-empty.
      const trailingSpace = needsSpaceBeforeSuffix(suffix) && text.length > 0 ? ' ' : '';
      const newInput = prefix + leadingSpace + text + trailingSpace + suffix;
      // Position cursor after the transcribed text (before suffix)
      const cursorPos = prefix.length + leadingSpace.length + text.length;
      writeVoiceInput(insertTextRef.current, setInputValueRaw, newInput, cursorPos);
      lastSetInputRef.current = newInput;
      // Update the prefix to include this chunk so focus mode can continue
      // appending subsequent transcripts after it.
      voicePrefixRef.current = prefix + leadingSpace + text;
    },
    [setInputValueRaw, inputValueRef, insertTextRef],
  );
  // Always invoked (real hook or the VOICE_MODE-off stub) so hook order is
  // stable; errors surface as an immediate notification for 10 seconds.
  const voice = voiceNs.useVoice({
    onTranscript: handleVoiceTranscript,
    onError: (message: string) => {
      addNotification({
        key: 'voice-error',
        text: message,
        color: 'error',
        priority: 'immediate',
        timeoutMs: 10_000,
      });
    },
    enabled: voiceEnabled,
    focusMode: false,
  });
  // Compute the character range of interim (not-yet-finalized) transcript
  // text in the input value, so the UI can dim it.
  const interimRange = useMemo((): InterimRange | null => {
    if (!feature('VOICE_MODE')) return null;
    if (voicePrefixRef.current === null) return null;
    if (voiceInterimTranscript.length === 0) return null;
    const prefix = voicePrefixRef.current;
    const start = prefix.length + (needsSpaceAfterPrefix(prefix, voiceInterimTranscript) ? 1 : 0);
    const end = start + voiceInterimTranscript.length;
    return { start, end };
  }, [voiceInterimTranscript]);
  return {
    stripTrailing,
    resetAnchor,
    handleKeyEvent: voice.handleKeyEvent,
    interimRange,
  };
}
/**
 * Hook that handles hold-to-talk voice activation.
 *
 * The activation key is configurable via keybindings (voice:pushToTalk,
 * default: space). Hold detection depends on OS auto-repeat delivering a
 * stream of events at 30-80ms intervals. Two binding types work:
 *
 * **Modifier + letter (meta+k, ctrl+x, alt+v):** Cleanest. Activates on
 * the first press — a modifier combo is unambiguous intent (can't be
 * typed accidentally), so no hold threshold applies. The letter part
 * auto-repeats while held, feeding release detection in useVoice.ts.
 * No flow-through, no stripping.
 *
 * **Bare chars (space, v, x):** Require HOLD_THRESHOLD rapid presses to
 * activate (a single space could be normal typing). The first
 * WARMUP_THRESHOLD presses flow into the input so a single press types
 * normally. Past that, rapid presses are swallowed; on activation the
 * flow-through chars are stripped. Binding "v" doesn't make "v"
 * untypable — normal typing (>120ms between keystrokes) flows through;
 * only rapid auto-repeat from a held key triggers activation.
 *
 * Known broken: modifier+space (NUL → parsed as ctrl+backtick), chords
 * (discrete sequences, no hold). Validation warns on these.
 *
 * @param voiceHandleKeyEvent - Voice hook key handler; the optional argument
 *   is a fallback duration in ms.
 * @param stripTrailing - Removes trailing hold-key chars from the input and
 *   returns how many remain.
 * @param resetAnchor - Undoes the anchor set by `stripTrailing` when voice
 *   fails to start.
 * @param isActive - Whether the prompt input is currently a valid target;
 *   also gates the `useInput` bridge subscription.
 * @returns The `handleKeyDown` handler (also subscribed via `useInput`).
 */
export function useVoiceKeybindingHandler({
  voiceHandleKeyEvent,
  stripTrailing,
  resetAnchor,
  isActive,
}: {
  voiceHandleKeyEvent: (fallbackMs?: number) => void;
  stripTrailing: (maxStrip: number, opts?: StripOpts) => number;
  resetAnchor: () => void;
  isActive: boolean;
}): { handleKeyDown: (e: KeyboardEvent) => void } {
  const getVoiceState = useGetVoiceState();
  const setVoiceState = useSetVoiceState();
  const keybindingContext = useOptionalKeybindingContext();
  const isModalOverlayActive = useIsModalOverlayActive();
  // Hooks are called unconditionally; the VOICE_MODE gate is applied to the
  // values so the hook count per render does not depend on the flag.
  const voiceEnabledRaw = useVoiceEnabled();
  const voiceEnabled = feature('VOICE_MODE') ? voiceEnabledRaw : false;
  const voiceStateRaw = useVoiceState(s => s.voiceState);
  const voiceState = feature('VOICE_MODE') ? voiceStateRaw : 'idle';
  // Find the configured key for voice:pushToTalk from keybinding context.
  // Forward iteration with last-wins (matching the resolver): if a later
  // Chat binding overrides the same chord with null or a different
  // action, the voice binding is discarded and null is returned — the
  // user explicitly disabled hold-to-talk via binding override, so
  // don't second-guess them with a fallback. The DEFAULT is only used
  // when there's no provider at all. Context filter is required — space
  // is also bound in Settings/Confirmation/Plugin (select:accept etc.);
  // without the filter those would null out the default.
  const voiceKeystroke = useMemo((): ParsedKeystroke | null => {
    if (!keybindingContext) return DEFAULT_VOICE_KEYSTROKE;
    let result: ParsedKeystroke | null = null;
    for (const binding of keybindingContext.bindings) {
      if (binding.context !== 'Chat') continue;
      if (binding.chord.length !== 1) continue;
      const ks = binding.chord[0];
      if (!ks) continue;
      if (binding.action === 'voice:pushToTalk') {
        result = ks;
      } else if (result !== null && keystrokesEqual(ks, result)) {
        // A later binding overrides this chord (null unbind or reassignment)
        result = null;
      }
    }
    return result;
  }, [keybindingContext]);
  // If the binding is a bare (unmodified) single printable char, terminal
  // auto-repeat may batch N keystrokes into one input event (e.g. "vvv"),
  // and the char flows into the text input — we need flow-through + strip.
  // Modifier combos (meta+k, ctrl+x) also auto-repeat (the letter part
  // repeats) but don't insert text, so they're swallowed from the first
  // press with no stripping needed. matchesKeyboardEvent handles those.
  const bareChar =
    voiceKeystroke !== null &&
    voiceKeystroke.key.length === 1 &&
    !voiceKeystroke.ctrl &&
    !voiceKeystroke.alt &&
    !voiceKeystroke.shift &&
    !voiceKeystroke.meta &&
    !voiceKeystroke.super
      ? voiceKeystroke.key
      : null;
  const rapidCountRef = useRef(0);
  // How many rapid chars we intentionally let through to the text
  // input (the first WARMUP_THRESHOLD). The activation strip removes
  // up to this many + the activation event's potential leak. For the
  // default (space) this is precise — pre-existing trailing spaces are
  // rare. For letter bindings (validation warns) this may over-strip
  // one pre-existing char if the input already ended in the bound
  // letter (e.g. "hav" + hold "v" → "ha"). We don't track that
  // boundary — it's best-effort and the warning says so.
  const charsInInputRef = useRef(0);
  // Trailing-char count remaining after the activation strip — these
  // belong to the user's anchored prefix and must be preserved during
  // recording's defensive leak cleanup.
  const recordingFloorRef = useRef(0);
  // True when the current recording was started by key-hold (not focus).
  // Used to avoid swallowing keypresses during focus-mode recording.
  const isHoldActiveRef = useRef(false);
  const resetTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
  // Reset hold state as soon as we leave 'recording'. The physical hold
  // ends when key-repeat stops (state → 'processing'); keeping the ref
  // set through 'processing' swallows new space presses the user types
  // while the transcript finalizes.
  useEffect(() => {
    if (voiceState !== 'recording') {
      isHoldActiveRef.current = false;
      rapidCountRef.current = 0;
      charsInInputRef.current = 0;
      recordingFloorRef.current = 0;
      setVoiceState(prev => {
        if (!prev.voiceWarmingUp) return prev;
        return { ...prev, voiceWarmingUp: false };
      });
    }
  }, [voiceState, setVoiceState]);
  const handleKeyDown = (e: KeyboardEvent): void => {
    if (!voiceEnabled) return;
    // PromptInput is not a valid transcript target — let the hold key
    // flow through instead of swallowing it into stale refs (#33556).
    // Two distinct unmount/unfocus paths (both needed):
    //   - !isActive: local-jsx command hid PromptInput (shouldHidePromptInput)
    //     without registering an overlay — e.g. /install-github-app,
    //     /plugin. Mirrors CommandKeybindingHandlers' isActive gate.
    //   - isModalOverlayActive: overlay (permission dialog, Select with
    //     onCancel) has focus; PromptInput is mounted but focus=false.
    if (!isActive || isModalOverlayActive) return;
    // null means the user overrode the default (null-unbind/reassign) —
    // hold-to-talk is disabled via binding. To toggle the feature
    // itself, use /voice.
    if (voiceKeystroke === null) return;
    // Match the configured key. Bare chars match by content (handles
    // batched auto-repeat like "vvv") with a modifier reject so e.g.
    // ctrl+v doesn't trip a "v" binding. Modifier combos go through
    // matchesKeyboardEvent (one event per repeat, no batching).
    let repeatCount: number;
    if (bareChar !== null) {
      // Reject every modifier the event can carry. superKey is included so
      // this path agrees with matchesKeyboardEvent, which also compares it:
      // a bare-char binding has super=false, so super+<char> is not a match.
      if (e.ctrl || e.meta || e.shift || e.superKey) return;
      // When bound to space, also accept U+3000 (full-width space) —
      // CJK IMEs emit it for the same physical key.
      const normalized = bareChar === ' ' ? normalizeFullWidthSpace(e.key) : e.key;
      // Fast-path: normal typing (any char that isn't the bound one)
      // bails here without allocating. The repeat() check only matters
      // for batched auto-repeat (input.length > 1) which is rare.
      if (normalized[0] !== bareChar) return;
      if (normalized.length > 1 && normalized !== bareChar.repeat(normalized.length)) return;
      repeatCount = normalized.length;
    } else {
      if (!matchesKeyboardEvent(e, voiceKeystroke)) return;
      repeatCount = 1;
    }
    // Guard: only swallow keypresses when recording was triggered by
    // key-hold. Focus-mode recording also sets voiceState to 'recording',
    // but keypresses should flow through normally (voiceHandleKeyEvent
    // returns early for focus-triggered sessions). We also check voiceState
    // from the store so that if voiceHandleKeyEvent() fails to transition
    // state (module not loaded, stream unavailable) we don't permanently
    // swallow keypresses.
    const currentVoiceState = getVoiceState().voiceState;
    if (isHoldActiveRef.current && currentVoiceState !== 'idle') {
      // Already recording — swallow continued keypresses and forward
      // to voice for release detection. For bare chars, defensively
      // strip in case the text input handler fired before this one
      // (listener order is not guaranteed). Modifier combos don't
      // insert text, so nothing to strip.
      e.stopImmediatePropagation();
      if (bareChar !== null) {
        stripTrailing(repeatCount, {
          char: bareChar,
          floor: recordingFloorRef.current,
        });
      }
      voiceHandleKeyEvent();
      return;
    }
    // Non-hold recording (focus-mode) or processing is active.
    // Modifier combos must not re-activate: stripTrailing(0,{anchor:true})
    // would overwrite voicePrefixRef with interim text and duplicate the
    // transcript on the next interim update. Pre-#22144, a single tap
    // hit the warmup else-branch (swallow only). Bare chars flow through
    // unconditionally — user may be typing during focus-recording.
    if (currentVoiceState !== 'idle') {
      if (bareChar === null) e.stopImmediatePropagation();
      return;
    }
    const countBefore = rapidCountRef.current;
    rapidCountRef.current += repeatCount;
    // ── Activation ────────────────────────────────────────────
    // Handled first so the warmup branch below does NOT also run
    // on this event — two strip calls in the same tick would both
    // read the stale inputValueRef and the second would under-strip.
    // Modifier combos activate on the first press — they can't be
    // typed accidentally, so the hold threshold (which exists to
    // distinguish typing a space from holding space) doesn't apply.
    if (bareChar === null || rapidCountRef.current >= HOLD_THRESHOLD) {
      e.stopImmediatePropagation();
      if (resetTimerRef.current) {
        clearTimeout(resetTimerRef.current);
        resetTimerRef.current = null;
      }
      rapidCountRef.current = 0;
      isHoldActiveRef.current = true;
      setVoiceState(prev => {
        if (!prev.voiceWarmingUp) return prev;
        return { ...prev, voiceWarmingUp: false };
      });
      if (bareChar !== null) {
        // Strip the intentional warmup chars plus this event's leak
        // (if text input fired first). Cap covers both; min(trailing)
        // handles the no-leak case. Anchor the voice prefix here.
        // The return value (remaining) becomes the floor for
        // recording-time leak cleanup.
        recordingFloorRef.current = stripTrailing(charsInInputRef.current + repeatCount, {
          char: bareChar,
          anchor: true,
        });
        charsInInputRef.current = 0;
        voiceHandleKeyEvent();
      } else {
        // Modifier combo: nothing inserted, nothing to strip. Just
        // anchor the voice prefix at the current cursor position.
        // Longer fallback: this call is at t=0 (before auto-repeat),
        // so the gap to the next keypress is the OS initial repeat
        // *delay* (up to ~2s), not the repeat *rate* (~30-80ms).
        stripTrailing(0, { anchor: true });
        voiceHandleKeyEvent(MODIFIER_FIRST_PRESS_FALLBACK_MS);
      }
      // If voice failed to transition (module not loaded, stream
      // unavailable, stale enabled), clear the ref so a later
      // focus-mode recording doesn't inherit stale hold state
      // and swallow keypresses. Store is synchronous — the check is
      // immediate. The anchor set by stripTrailing above will
      // be overwritten on retry (anchor always overwrites now).
      if (getVoiceState().voiceState === 'idle') {
        isHoldActiveRef.current = false;
        resetAnchor();
      }
      return;
    }
    // ── Warmup (bare-char only; modifier combos activated above) ──
    // First WARMUP_THRESHOLD chars flow to the text input so normal
    // typing has zero latency (a single press types normally).
    // Subsequent rapid chars are swallowed so the input stays aligned
    // with the warmup UI. Strip defensively (listener order is not
    // guaranteed — text input may have already added the char). The
    // floor preserves the intentional warmup chars; the strip is a
    // no-op when nothing leaked. Check countBefore so the event that
    // crosses the threshold still flows through (terminal batching).
    if (countBefore >= WARMUP_THRESHOLD) {
      e.stopImmediatePropagation();
      stripTrailing(repeatCount, {
        char: bareChar,
        floor: charsInInputRef.current,
      });
    } else {
      charsInInputRef.current += repeatCount;
    }
    // Show warmup feedback once we detect a hold pattern
    if (rapidCountRef.current >= WARMUP_THRESHOLD) {
      setVoiceState(prev => {
        if (prev.voiceWarmingUp) return prev;
        return { ...prev, voiceWarmingUp: true };
      });
    }
    // Restart the inactivity timer: if no further matching press arrives
    // within RAPID_KEY_GAP_MS, the counters and the warmup flag are reset.
    // The refs and setter are handed to the callback as timer arguments.
    if (resetTimerRef.current) {
      clearTimeout(resetTimerRef.current);
    }
    resetTimerRef.current = setTimeout(
      (resetTimerRef, rapidCountRef, charsInInputRef, setVoiceState) => {
        resetTimerRef.current = null;
        rapidCountRef.current = 0;
        charsInInputRef.current = 0;
        setVoiceState(prev => {
          if (!prev.voiceWarmingUp) return prev;
          return { ...prev, voiceWarmingUp: false };
        });
      },
      RAPID_KEY_GAP_MS,
      resetTimerRef,
      rapidCountRef,
      charsInInputRef,
      setVoiceState,
    );
  };
  // Backward-compat bridge: REPL.tsx doesn't yet wire handleKeyDown to
  // <Box onKeyDown>. Subscribe via useInput and adapt InputEvent →
  // KeyboardEvent until the consumer is migrated (separate PR).
  // TODO(onKeyDown-migration): remove once REPL passes handleKeyDown.
  useInput(
    (_input, _key, event) => {
      const kbEvent = new KeyboardEvent(event.keypress);
      handleKeyDown(kbEvent);
      // handleKeyDown stopped the adapter event, not the InputEvent the
      // emitter actually checks — forward it so the text input's useInput
      // listener is skipped and held spaces don't leak into the prompt.
      if (kbEvent.didStopImmediatePropagation()) {
        event.stopImmediatePropagation();
      }
    },
    { isActive },
  );
  return { handleKeyDown };
}
// TODO(onKeyDown-migration): temporary shim so existing JSX callers
// (<VoiceKeybindingHandler .../>) keep compiling. Remove once REPL.tsx
// wires handleKeyDown directly.
// Renders nothing: it exists only to run useVoiceKeybindingHandler with the
// given props; the handler the hook returns is ignored here.
export function VoiceKeybindingHandler(props: {
  voiceHandleKeyEvent: (fallbackMs?: number) => void;
  stripTrailing: (maxStrip: number, opts?: StripOpts) => number;
  resetAnchor: () => void;
  isActive: boolean;
}): null {
  const { voiceHandleKeyEvent, stripTrailing, resetAnchor, isActive } = props;
  useVoiceKeybindingHandler({
    voiceHandleKeyEvent,
    stripTrailing,
    resetAnchor,
    isActive,
  });
  return null;
}