Files
claude-code/src/services/skillSearch/intentNormalize.ts
2026-04-22 22:38:09 +08:00

150 lines
5.5 KiB
TypeScript

/**
* Intent Normalization Layer for Skill Search
*
* Problem: TF-IDF bag-of-words loses meaning when the user query is in Chinese
* and most skill descriptions are English. CJK bi-grams get DF=1 (language
* mismatch, not true rarity), producing IDF values that promote spurious
* matches like `prompt-optimizer` for `帮我优化代码的性能`.
*
* Fix: Before handing the query to `searchSkills()`, ask Haiku to normalize it
* into 3-6 English task/object keywords. Concatenate the normalized form with
* the original so TF-IDF sees both — English keywords carry real matching
* signal, the original text stays as a fallback.
*
* Design:
* - Turn-zero only (blocking on user input): one Haiku call per session-unique
* query. Not called in inter-turn prefetch (which repeats per tool loop).
* - Process-level cache: identical queries within a session reuse the result.
* - Graceful fallback: Haiku failure / timeout / empty → return original query.
* - Non-CJK fast path: queries containing no CJK characters (e.g. plain ASCII) skip the LLM entirely.
* - Feature-flagged: `SKILL_SEARCH_INTENT_ENABLED=1` to opt in.
*/
import { queryHaiku } from '../api/claude.js'
import { asSystemPrompt } from '../../utils/systemPromptType.js'
import { logForDebugging } from '../../utils/debug.js'
/**
 * System prompt used by `callHaiku` for the intent-normalization request.
 *
 * It instructs the model to reduce a natural-language request to 3-6
 * space-separated, lowercase English keywords (task verb, object, and
 * optional domain) and to emit nothing else. The few-shot examples below are
 * part of the prompt text itself, so editing them changes model behavior.
 */
const INTENT_SYSTEM_PROMPT = `You are a query normalizer for a skill-search index.
Given a user's natural-language request (often Chinese, possibly long), extract 3-6 English keywords that capture:
1. TASK VERB (optimize, review, debug, refactor, test, deploy, analyze, write, audit, design, research, cleanup, implement)
2. OBJECT (code, prompt, test, UI, API, database, documentation, performance, security, architecture)
3. CONTEXT/DOMAIN when clear (frontend, backend, mobile, python, go, rust, typescript)
Output ONLY space-separated lowercase English keywords. No prose, no JSON, no punctuation, no code fences.
Examples:
- "帮我优化代码的性能" -> optimize code performance refactor
- "研究当前代码的实现然后分析优化思路" -> analyze code research refactor architecture
- "优化 prompt 的表达" -> optimize prompt refine writing
- "帮我做 code review" -> code review audit
- "清理代码里的 TODO" -> cleanup refactor dead-code
- "重构这个模块的代码" -> refactor code modularize
- "帮我写个 Go 单元测试" -> write test golang unit
Output ONLY keywords. Nothing else.`
/**
 * Deadline (ms) for the Haiku call when `SKILL_SEARCH_INTENT_TIMEOUT_MS` is
 * unset or does not parse to a positive finite number.
 */
const DEFAULT_TIMEOUT_MS = 6_000
/** Maximum number of UTF-16 code units of the user query forwarded to Haiku. */
const MAX_QUERY_CHARS = 500
/** Upper bound on the length of the sanitized keyword string. */
const MAX_KEYWORDS_CHARS = 120
/**
 * Process-level cache. Keyed by the original (trimmed) query; the value is
 * the exact string `normalizeQueryIntent` returned for it, which is the bare
 * query when no keywords were obtained.
 */
const cache = new Map<string, string>()
/**
 * Reports whether intent normalization has been opted into.
 *
 * @returns `true` only when the `SKILL_SEARCH_INTENT_ENABLED` environment
 *   variable is exactly the string `'1'`; any other value, or no value at
 *   all, leaves the feature off.
 */
export function isIntentNormalizeEnabled(): boolean {
  const flag = process.env.SKILL_SEARCH_INTENT_ENABLED
  return flag === '1'
}
/**
 * Empties the process-level normalization cache.
 *
 * Meant for test isolation only: production code is expected to keep the
 * cache for the lifetime of the process.
 */
export function clearIntentNormalizeCache(): void {
  cache.clear()
}
/**
 * Matches any character belonging to the Han, Hiragana, Katakana or Hangul
 * scripts. Script properties are used instead of a single code-point range so
 * that kana-only Japanese, Korean, and ideographs outside U+4E00-U+9FFF
 * (Extension A, compatibility and supplementary-plane ideographs) are
 * detected as well.
 */
const CJK_SCRIPT_PATTERN =
  /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u

/**
 * Normalize a user query so TF-IDF sees English task keywords.
 *
 * The query is returned unchanged (trimmed) when it is empty, when the
 * feature flag is off, or when it contains no CJK characters. Otherwise the
 * (length-capped) query is sent to Haiku once and the outcome is memoized in
 * the process-level cache under the trimmed query, so repeated identical
 * queries do not trigger another call. Fallback results are memoized too.
 *
 * @param query Raw user query.
 * @returns `<original> <keywords>` on success, or the trimmed original string
 *   on any failure path. Never throws.
 */
export async function normalizeQueryIntent(query: string): Promise<string> {
  const trimmed = query.trim()
  if (!trimmed) return trimmed
  if (!isIntentNormalizeEnabled()) return trimmed
  // Queries without CJK text are already in the right shape for the index.
  if (!CJK_SCRIPT_PATTERN.test(trimmed)) return trimmed

  const cached = cache.get(trimmed)
  if (cached !== undefined) return cached

  let capped = trimmed.slice(0, MAX_QUERY_CHARS)
  if (capped.length < trimmed.length) {
    // The cap counts UTF-16 code units; if it landed between the two halves
    // of a surrogate pair, drop the dangling high surrogate so the model is
    // never sent malformed text.
    const lastUnit = capped.charCodeAt(capped.length - 1)
    if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) capped = capped.slice(0, -1)
  }

  const keywords = await callHaiku(capped)
  // Keep the original text in front so it still contributes as a fallback
  // signal next to the English keywords.
  const result = keywords ? `${trimmed} ${keywords}` : trimmed
  cache.set(trimmed, result)
  logForDebugging(
    `[skill-search] intent normalized: "${trimmed.slice(0, 40)}" -> "${keywords}"`,
  )
  return result
}
/**
 * Asks Haiku for normalized keywords for a single query.
 *
 * An `AbortController` is armed with the configured timeout and its signal is
 * handed to `queryHaiku`; the timer is always cleared once the call settles.
 * Any error raised while awaiting or post-processing the response is logged
 * and converted into an empty result.
 *
 * @param query The (already length-capped) user query.
 * @returns The sanitized keyword string, or `''` when the call failed or
 *   produced no usable text.
 */
async function callHaiku(query: string): Promise<string> {
  const deadlineMs = getTimeoutMs()
  const abortController = new AbortController()
  const deadline = setTimeout(() => abortController.abort(), deadlineMs)

  try {
    const reply = await queryHaiku({
      systemPrompt: asSystemPrompt([INTENT_SYSTEM_PROMPT]),
      userPrompt: query,
      signal: abortController.signal,
      options: {
        querySource: 'skill_search_intent',
        enablePromptCaching: true,
        agents: [],
        isNonInteractiveSession: true,
        hasAppendSystemPrompt: false,
        mcpTools: [],
      },
    })
    const replyText = extractResponseText(reply?.message?.content)
    return sanitizeKeywords(replyText)
  } catch (error) {
    // Best-effort feature: report the failure and let the caller fall back.
    logForDebugging(`[skill-search] intent normalize failed: ${error}`)
    return ''
  } finally {
    clearTimeout(deadline)
  }
}
/**
 * Largest delay accepted by `setTimeout` (2^31 - 1 ms). Node coerces larger
 * delays to 1 ms, which would abort the request almost immediately.
 */
const MAX_TIMER_DELAY_MS = 2_147_483_647

/**
 * Resolves the deadline for the Haiku call.
 *
 * Reads `SKILL_SEARCH_INTENT_TIMEOUT_MS`; a missing, empty, non-numeric,
 * non-finite, zero or negative value falls back to `DEFAULT_TIMEOUT_MS`.
 * Valid overrides are clamped to the maximum delay a timer supports so that
 * an oversized value means "wait as long as possible" instead of silently
 * turning into a 1 ms timeout.
 *
 * @returns Timeout in milliseconds, in the range (0, 2^31 - 1].
 */
function getTimeoutMs(): number {
  const raw = process.env.SKILL_SEARCH_INTENT_TIMEOUT_MS
  if (!raw) return DEFAULT_TIMEOUT_MS
  const parsed = Number(raw)
  if (!Number.isFinite(parsed) || parsed <= 0) return DEFAULT_TIMEOUT_MS
  return Math.min(parsed, MAX_TIMER_DELAY_MS)
}
/**
 * Pulls the plain text out of a message `content` value.
 *
 * Only array inputs are inspected. Within the array, entries that are
 * objects with `type === 'text'` and a string `text` field contribute; every
 * other entry is ignored. The contributing strings are concatenated in order
 * with no separator and the result is trimmed.
 *
 * @param content Untyped `content` value taken from a model response.
 * @returns The concatenated, trimmed text, or `''` when nothing qualifies.
 */
function extractResponseText(content: unknown): string {
  if (!Array.isArray(content)) return ''

  const isTextBlock = (entry: unknown): entry is { type: 'text'; text: string } => {
    if (typeof entry !== 'object' || entry === null) return false
    const fields = entry as Record<string, unknown>
    return fields.type === 'text' && typeof fields.text === 'string'
  }

  return content
    .filter(isTextBlock)
    .map(entry => entry.text)
    .join('')
    .trim()
}
/**
 * Reduces raw model output to a clean, space-separated keyword string.
 *
 * The text is lowercased and every character other than ASCII letters,
 * digits, hyphens and spaces is treated as a separator. Hyphens are kept only
 * inside a keyword (`dead-code`); leading/trailing hyphens and hyphen-only
 * tokens, such as list bullets or the remains of an echoed `->`, are removed
 * so they cannot enter the index as terms. The result is cut at a keyword
 * boundary, so it never ends with a partial keyword or a trailing space.
 *
 * @param raw Text returned by the model.
 * @param maxChars Maximum length of the returned string; defaults to
 *   `MAX_KEYWORDS_CHARS`.
 * @returns The sanitized keywords joined by single spaces, or `''` when no
 *   keyword fits or survives.
 */
function sanitizeKeywords(
  raw: string,
  maxChars: number = MAX_KEYWORDS_CHARS,
): string {
  if (!raw) return ''

  const tokens = raw
    .toLowerCase()
    .replace(/[^a-z0-9\- ]+/g, ' ')
    .split(' ')
    .map(token => token.replace(/^-+|-+$/g, ''))
    .filter(token => token.length > 0)

  // Accumulate whole keywords while the joined string still fits the budget.
  const kept: string[] = []
  let joinedLength = 0
  for (const token of tokens) {
    const nextLength =
      kept.length === 0 ? token.length : joinedLength + 1 + token.length
    if (nextLength > maxChars) break
    kept.push(token)
    joinedLength = nextLength
  }
  return kept.join(' ')
}