/**
 * Intent Normalization Layer for Skill Search
 *
 * Problem: TF-IDF bag-of-words loses meaning when the user query is in Chinese
 * and most skill descriptions are English. CJK bi-grams get DF=1 (language
 * mismatch, not true rarity), producing IDF values that promote spurious
 * matches like `prompt-optimizer` for `帮我优化代码的性能`.
 *
 * Fix: Before handing the query to `searchSkills()`, ask Haiku to normalize it
 * into 3-6 English task/object keywords. Concatenate the normalized form with
 * the original so TF-IDF sees both — English keywords carry real matching
 * signal, the original text stays as a fallback.
 *
 * Design:
 * - Turn-zero only (blocking on user input): one Haiku call per session-unique
 *   query. Not called in inter-turn prefetch (which repeats per tool loop).
 * - Process-level cache: identical queries within a session reuse the result.
 * - Graceful fallback: Haiku failure / timeout / empty → return original query.
 * - ASCII-only fast path: queries without CJK characters skip the LLM entirely.
 * - Feature-flagged: `SKILL_SEARCH_INTENT_ENABLED=1` to opt in.
 */

import { queryHaiku } from '../api/claude.js'
import { asSystemPrompt } from '../../utils/systemPromptType.js'
import { logForDebugging } from '../../utils/debug.js'

/** System prompt instructing Haiku to emit only lowercase English keywords. */
const INTENT_SYSTEM_PROMPT = `You are a query normalizer for a skill-search index.
Given a user's natural-language request (often Chinese, possibly long), extract 3-6 English keywords that capture:
1. TASK VERB (optimize, review, debug, refactor, test, deploy, analyze, write, audit, design, research, cleanup, implement)
2. OBJECT (code, prompt, test, UI, API, database, documentation, performance, security, architecture)
3. CONTEXT/DOMAIN when clear (frontend, backend, mobile, python, go, rust, typescript)
Output ONLY space-separated lowercase English keywords. No prose, no JSON, no punctuation, no code fences.

Examples:
- "帮我优化代码的性能" -> optimize code performance refactor
- "研究当前代码的实现然后分析优化思路" -> analyze code research refactor architecture
- "优化 prompt 的表达" -> optimize prompt refine writing
- "帮我做 code review" -> code review audit
- "清理代码里的 TODO" -> cleanup refactor dead-code
- "重构这个模块的代码" -> refactor code modularize
- "帮我写个 Go 单元测试" -> write test golang unit
Output ONLY keywords. Nothing else.`

/** Abort the Haiku call after this long unless overridden via env. */
const DEFAULT_TIMEOUT_MS = 6_000
/** Upper bound on how much of the user query is sent to Haiku. */
const MAX_QUERY_CHARS = 500
/** Upper bound on the sanitized keyword string appended to the query. */
const MAX_KEYWORDS_CHARS = 120

/** Process-level cache. Keyed by the original (trimmed) query. */
const cache = new Map<string, string>()

/** True when the `SKILL_SEARCH_INTENT_ENABLED` env var is exactly `'1'`. */
export function isIntentNormalizeEnabled(): boolean {
  return process.env.SKILL_SEARCH_INTENT_ENABLED === '1'
}

/** Only reset between tests. */
export function clearIntentNormalizeCache(): void {
  cache.clear()
}

/**
 * Normalize a user query so TF-IDF sees English task keywords.
 *
 * @param query - Raw user query; it is trimmed before any other processing.
 * @returns `<original> <keywords>` on success, or the original (trimmed)
 *   string on any failure path. Never throws.
 */
export async function normalizeQueryIntent(query: string): Promise<string> {
  const trimmed = query.trim()
  if (!trimmed) return trimmed
  if (!isIntentNormalizeEnabled()) return trimmed

  // Fast path: only queries containing at least one CJK Unified Ideograph
  // (U+4E00–U+9FFF) are sent to the LLM; everything else is returned as-is.
  if (!/[\u4e00-\u9fff]/.test(trimmed)) return trimmed

  const cached = cache.get(trimmed)
  if (cached !== undefined) return cached

  // Only the first MAX_QUERY_CHARS characters are sent to Haiku; the cache key
  // and the returned string still use the full trimmed query.
  const capped = trimmed.slice(0, MAX_QUERY_CHARS)
  const keywords = await callHaiku(capped)
  const result = keywords ? `${trimmed} ${keywords}` : trimmed

  // The fallback (keywords === '') is cached as well, so a query whose Haiku
  // call failed is not retried within this process.
  cache.set(trimmed, result)
  logForDebugging(
    `[skill-search] intent normalized: "${trimmed.slice(0, 40)}" -> "${keywords}"`,
  )
  return result
}

/**
 * Ask Haiku for keywords, aborting after the configured timeout.
 *
 * @returns The sanitized keyword string, or `''` on any error (including the
 *   abort triggered by the timeout) or when no text was returned.
 */
async function callHaiku(query: string): Promise<string> {
  const timeoutMs = getTimeoutMs()
  const controller = new AbortController()
  const timer = setTimeout(() => controller.abort(), timeoutMs)
  try {
    const response = await queryHaiku({
      systemPrompt: asSystemPrompt([INTENT_SYSTEM_PROMPT]),
      userPrompt: query,
      signal: controller.signal,
      options: {
        querySource: 'skill_search_intent',
        enablePromptCaching: true,
        agents: [],
        isNonInteractiveSession: true,
        hasAppendSystemPrompt: false,
        mcpTools: [],
      },
    })
    const text = extractResponseText(response?.message?.content)
    return sanitizeKeywords(text)
  } catch (error) {
    logForDebugging(`[skill-search] intent normalize failed: ${error}`)
    return ''
  } finally {
    // Always release the timer so it cannot fire after the call has settled.
    clearTimeout(timer)
  }
}

/**
 * Read the timeout from `SKILL_SEARCH_INTENT_TIMEOUT_MS`, falling back to
 * `DEFAULT_TIMEOUT_MS` when the variable is unset, non-numeric, non-finite,
 * or not strictly positive.
 */
function getTimeoutMs(): number {
  const raw = process.env.SKILL_SEARCH_INTENT_TIMEOUT_MS
  if (!raw) return DEFAULT_TIMEOUT_MS
  const parsed = Number(raw)
  if (!Number.isFinite(parsed) || parsed <= 0) return DEFAULT_TIMEOUT_MS
  return parsed
}

/**
 * Concatenate the `text` of every `{ type: 'text' }` block in a response
 * content array. Non-array input and malformed blocks yield/are skipped to `''`.
 */
function extractResponseText(content: unknown): string {
  if (!Array.isArray(content)) return ''
  const parts: string[] = []
  for (const block of content) {
    if (!block || typeof block !== 'object') continue
    const record = block as Record<string, unknown>
    if (record.type !== 'text') continue
    if (typeof record.text === 'string') parts.push(record.text)
  }
  return parts.join('').trim()
}

/**
 * Reduce raw model output to space-separated lowercase keywords.
 *
 * Keeps only ASCII letters, digits, hyphens, and spaces; collapses whitespace;
 * and caps the result at `MAX_KEYWORDS_CHARS`. When the cap falls inside a
 * keyword, the partial token is dropped so a truncated fragment never reaches
 * the search index.
 *
 * @returns The cleaned keyword string, or `''` when nothing usable remains.
 */
function sanitizeKeywords(raw: string): string {
  if (!raw) return ''
  // Strip anything that's not a keyword character. Keep ascii letters, digits,
  // hyphens, and spaces. Collapse whitespace.
  const cleaned = raw
    .toLowerCase()
    .replace(/[^a-z0-9\- ]+/g, ' ')
    .replace(/\s+/g, ' ')
    .trim()
  if (cleaned.length <= MAX_KEYWORDS_CHARS) return cleaned

  const head = cleaned.slice(0, MAX_KEYWORDS_CHARS)
  // The cut landed mid-keyword unless the next character is a separator.
  if (cleaned.charAt(MAX_KEYWORDS_CHARS) !== ' ') {
    const lastSpace = head.lastIndexOf(' ')
    // A single over-long token has no boundary to fall back to; hard-cut it.
    if (lastSpace > 0) return head.slice(0, lastSpace)
  }
  return head.trimEnd()
}