mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-17 13:55:50 +00:00
150 lines
5.5 KiB
TypeScript
150 lines
5.5 KiB
TypeScript
/**
 * Intent Normalization Layer for Skill Search
 *
 * Problem: TF-IDF bag-of-words loses meaning when the user query is in Chinese
 * and most skill descriptions are English. CJK bi-grams get DF=1 (language
 * mismatch, not true rarity), producing IDF values that promote spurious
 * matches like `prompt-optimizer` for `帮我优化代码的性能`.
 *
 * Fix: Before handing the query to `searchSkills()`, ask Haiku to normalize it
 * into 3-6 English task/object keywords. Concatenate the normalized form with
 * the original so TF-IDF sees both — English keywords carry real matching
 * signal, the original text stays as a fallback.
 *
 * Design:
 *  - Turn-zero only (blocking on user input): one Haiku call per session-unique
 *    query. Not called in inter-turn prefetch (which repeats per tool loop).
 *  - Process-level cache: identical queries within a session reuse the result.
 *  - Graceful fallback: Haiku failure / timeout / empty → return original query.
 *  - ASCII-only fast path: queries without CJK characters skip the LLM entirely.
 *  - Feature-flagged: `SKILL_SEARCH_INTENT_ENABLED=1` to opt in.
 */
|
|
|
|
import { queryHaiku } from '../api/claude.js'
|
|
import { asSystemPrompt } from '../../utils/systemPromptType.js'
|
|
import { logForDebugging } from '../../utils/debug.js'
|
|
|
|
/**
 * System prompt sent with every intent-normalization request.
 *
 * It instructs the model to reply with 3-6 space-separated lowercase English
 * keywords (task verb, object, optional domain) and nothing else. The reply is
 * still passed through `sanitizeKeywords`, so stray punctuation is tolerated.
 *
 * Paragraphs are separated by a single blank line; the literal must not
 * contain any other filler lines because every character is sent to the model.
 */
const INTENT_SYSTEM_PROMPT = `You are a query normalizer for a skill-search index.

Given a user's natural-language request (often Chinese, possibly long), extract 3-6 English keywords that capture:
1. TASK VERB (optimize, review, debug, refactor, test, deploy, analyze, write, audit, design, research, cleanup, implement)
2. OBJECT (code, prompt, test, UI, API, database, documentation, performance, security, architecture)
3. CONTEXT/DOMAIN when clear (frontend, backend, mobile, python, go, rust, typescript)

Output ONLY space-separated lowercase English keywords. No prose, no JSON, no punctuation, no code fences.

Examples:
- "帮我优化代码的性能" -> optimize code performance refactor
- "研究当前代码的实现然后分析优化思路" -> analyze code research refactor architecture
- "优化 prompt 的表达" -> optimize prompt refine writing
- "帮我做 code review" -> code review audit
- "清理代码里的 TODO" -> cleanup refactor dead-code
- "重构这个模块的代码" -> refactor code modularize
- "帮我写个 Go 单元测试" -> write test golang unit

Output ONLY keywords. Nothing else.`
|
|
|
|
/** Abort deadline for one Haiku call when no valid override is configured. */
const DEFAULT_TIMEOUT_MS = 6 * 1_000

/** Longest prefix of the user query (in UTF-16 code units) sent to Haiku. */
const MAX_QUERY_CHARS = 500

/** Upper bound on the length of the sanitized keyword string. */
const MAX_KEYWORDS_CHARS = 120

/**
 * Process-wide memo of normalization results.
 * Key: the original query after trimming. Value: the string handed to search.
 */
const cache: Map<string, string> = new Map()
|
|
|
|
/**
 * Reports whether intent normalization is switched on.
 *
 * @returns `true` only when the `SKILL_SEARCH_INTENT_ENABLED` environment
 *   variable is exactly the string `'1'`; any other value (or none) is `false`.
 */
export function isIntentNormalizeEnabled(): boolean {
  const flag = process.env.SKILL_SEARCH_INTENT_ENABLED
  return flag === '1'
}
|
|
|
|
/**
 * Empties the process-level normalization cache.
 * Intended for test isolation only; production code never needs to call it.
 */
export function clearIntentNormalizeCache(): void {
  // Drop every memoized query -> result pair.
  cache.clear()
}
|
|
|
|
/**
 * Matches any Han, Hiragana, Katakana, or Hangul character, including
 * code points outside the Basic Multilingual Plane.
 */
const CJK_SCRIPT_PATTERN =
  /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}\p{Script=Hangul}]/u

/**
 * Cuts `text` to at most `maxUnits` UTF-16 code units without leaving a lone
 * high surrogate at the end (which would be an invalid character on the wire).
 */
function truncateCodeUnits(text: string, maxUnits: number): string {
  if (text.length <= maxUnits) return text
  const cut = text.slice(0, maxUnits)
  const lastUnit = cut.charCodeAt(cut.length - 1)
  const endsOnHighSurrogate = lastUnit >= 0xd800 && lastUnit <= 0xdbff
  return endsOnHighSurrogate ? cut.slice(0, -1) : cut
}

/**
 * Normalize a user query so TF-IDF sees English task keywords.
 *
 * Flow:
 *  1. Trim; an empty query, a disabled feature flag, or a query without any
 *     CJK-script character returns the trimmed query unchanged (no LLM call).
 *  2. Otherwise look the trimmed query up in the process-level cache.
 *  3. On a miss, send at most `MAX_QUERY_CHARS` code units to Haiku and cache
 *     the outcome. The fallback (no keywords) is cached as well, so a failed
 *     call is not retried for the same query.
 *
 * @param query Raw user query.
 * @returns `<original> <keywords>` on success, or the trimmed original string
 *   on any failure path. Never throws.
 */
export async function normalizeQueryIntent(query: string): Promise<string> {
  const trimmed = query.trim()
  if (!trimmed) return trimmed
  if (!isIntentNormalizeEnabled()) return trimmed

  // Queries with no CJK-script characters are already in the right shape for
  // the index. Kana-only and Hangul queries count as CJK here, not just Han.
  if (!CJK_SCRIPT_PATTERN.test(trimmed)) return trimmed

  const cached = cache.get(trimmed)
  if (cached !== undefined) return cached

  const capped = truncateCodeUnits(trimmed, MAX_QUERY_CHARS)
  const keywords = await callHaiku(capped)
  const result = keywords ? `${trimmed} ${keywords}` : trimmed
  cache.set(trimmed, result)

  const preview = truncateCodeUnits(trimmed, 40)
  if (keywords) {
    logForDebugging(
      `[skill-search] intent normalized: "${preview}" -> "${keywords}"`,
    )
  } else {
    // Keep the failure path distinguishable from a successful normalization.
    logForDebugging(
      `[skill-search] intent normalize produced no keywords for "${preview}"; using original query`,
    )
  }
  return result
}
|
|
|
|
/**
 * Asks Haiku for keywords describing `query`, bounded by a timeout.
 *
 * The request is aborted through an `AbortController` once `getTimeoutMs()`
 * milliseconds elapse. Any error (including the abort) is logged and mapped
 * to an empty string, so this function does not reject.
 *
 * @param query Query text to normalize (already length-capped by the caller).
 * @returns Sanitized keyword string, or `''` on failure or empty output.
 */
async function callHaiku(query: string): Promise<string> {
  const deadlineMs = getTimeoutMs()
  const abortController = new AbortController()
  const deadline = setTimeout(() => {
    abortController.abort()
  }, deadlineMs)

  try {
    const reply = await queryHaiku({
      systemPrompt: asSystemPrompt([INTENT_SYSTEM_PROMPT]),
      userPrompt: query,
      signal: abortController.signal,
      options: {
        querySource: 'skill_search_intent',
        enablePromptCaching: true,
        agents: [],
        isNonInteractiveSession: true,
        hasAppendSystemPrompt: false,
        mcpTools: [],
      },
    })
    // Flatten the text blocks first, then reduce them to safe keyword tokens.
    const rawText = extractResponseText(reply?.message?.content)
    return sanitizeKeywords(rawText)
  } catch (error) {
    logForDebugging(`[skill-search] intent normalize failed: ${error}`)
    return ''
  } finally {
    // Always release the timer, whether the call succeeded, failed, or aborted.
    clearTimeout(deadline)
  }
}
|
|
|
|
/**
 * Largest delay `setTimeout` honours (2^31 - 1 ms). Node resets any larger
 * delay to 1 ms, which would abort the request almost immediately.
 */
const MAX_TIMER_DELAY_MS = 2_147_483_647

/**
 * Resolves the abort timeout for a Haiku call.
 *
 * Reads `SKILL_SEARCH_INTENT_TIMEOUT_MS`; an unset, empty, non-numeric,
 * non-finite, zero, or negative value falls back to `DEFAULT_TIMEOUT_MS`.
 * Valid values are clamped to the maximum delay a timer can represent.
 *
 * @returns Timeout in milliseconds, always in `(0, MAX_TIMER_DELAY_MS]`.
 */
function getTimeoutMs(): number {
  const raw = process.env.SKILL_SEARCH_INTENT_TIMEOUT_MS
  if (!raw) return DEFAULT_TIMEOUT_MS
  const parsed = Number(raw)
  if (!Number.isFinite(parsed) || parsed <= 0) return DEFAULT_TIMEOUT_MS
  return Math.min(parsed, MAX_TIMER_DELAY_MS)
}
|
|
|
|
/**
 * Concatenates the text of every `{ type: 'text', text: string }` entry in a
 * response content array.
 *
 * Entries that are not objects, have a different `type`, or whose `text` is
 * not a string are skipped. Fragments are joined without a separator and the
 * combined string is trimmed.
 *
 * @param content Untyped response content; anything but an array yields `''`.
 * @returns The trimmed concatenation of all text fragments.
 */
function extractResponseText(content: unknown): string {
  if (!Array.isArray(content)) return ''

  const fragments = content.flatMap((item: unknown): string[] => {
    if (typeof item !== 'object' || item === null) return []
    const fields = item as Record<string, unknown>
    if (fields.type !== 'text') return []
    return typeof fields.text === 'string' ? [fields.text] : []
  })

  return fragments.join('').trim()
}
|
|
|
|
/**
 * Reduces raw model output to a clean, space-separated keyword string.
 *
 * Steps:
 *  1. Lowercase, then replace every run of characters other than ASCII
 *     letters, digits, hyphens, and spaces with a single space.
 *  2. Split on spaces and keep only tokens containing at least one letter or
 *     digit, which drops punctuation leftovers such as a lone `-` from `->`
 *     or a bullet marker.
 *  3. Keep whole tokens while the joined result fits in `MAX_KEYWORDS_CHARS`,
 *     so the limit never cuts a keyword in half or leaves a trailing space.
 *     Only if the very first token alone exceeds the limit is it hard-cut.
 *
 * @param raw Text returned by the model.
 * @returns Keyword string of at most `MAX_KEYWORDS_CHARS` characters, or `''`
 *   when nothing usable remains.
 */
function sanitizeKeywords(raw: string): string {
  if (!raw) return ''

  const tokens = raw
    .toLowerCase()
    .replace(/[^a-z0-9\- ]+/g, ' ')
    .split(' ')
    .filter(token => /[a-z0-9]/.test(token))

  const kept: string[] = []
  let usedChars = 0
  for (const token of tokens) {
    // One separating space is needed before every token except the first.
    const nextChars =
      kept.length === 0 ? token.length : usedChars + 1 + token.length
    if (nextChars > MAX_KEYWORDS_CHARS) break
    kept.push(token)
    usedChars = nextChars
  }

  if (kept.length === 0) {
    // Either there were no tokens at all, or the first one is oversized.
    return (tokens[0] ?? '').slice(0, MAX_KEYWORDS_CHARS)
  }
  return kept.join(' ')
}
|