mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-18 22:35:51 +00:00
feat: 添加 skill learning 技能学习闭环系统
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
149
src/services/skillSearch/intentNormalize.ts
Normal file
149
src/services/skillSearch/intentNormalize.ts
Normal file
@@ -0,0 +1,149 @@
|
||||
/**
|
||||
* Intent Normalization Layer for Skill Search
|
||||
*
|
||||
* Problem: TF-IDF bag-of-words loses meaning when the user query is in Chinese
|
||||
* and most skill descriptions are English. CJK bi-grams get DF=1 (language
|
||||
* mismatch, not true rarity), producing IDF values that promote spurious
|
||||
* matches like `prompt-optimizer` for `帮我优化代码的性能`.
|
||||
*
|
||||
* Fix: Before handing the query to `searchSkills()`, ask Haiku to normalize it
|
||||
* into 3-6 English task/object keywords. Concatenate the normalized form with
|
||||
* the original so TF-IDF sees both — English keywords carry real matching
|
||||
* signal, the original text stays as a fallback.
|
||||
*
|
||||
* Design:
|
||||
* - Turn-zero only (blocking on user input): one Haiku call per session-unique
|
||||
* query. Not called in inter-turn prefetch (which repeats per tool loop).
|
||||
* - Process-level cache: identical queries within a session reuse the result.
|
||||
* - Graceful fallback: Haiku failure / timeout / empty → return original query.
|
||||
* - ASCII-only fast path: queries without CJK characters skip the LLM entirely.
|
||||
* - Feature-flagged: `SKILL_SEARCH_INTENT_ENABLED=1` to opt in.
|
||||
*/
|
||||
|
||||
import { queryHaiku } from '../api/claude.js'
|
||||
import { asSystemPrompt } from '../../utils/systemPromptType.js'
|
||||
import { logForDebugging } from '../../utils/debug.js'
|
||||
|
||||
const INTENT_SYSTEM_PROMPT = `You are a query normalizer for a skill-search index.
|
||||
|
||||
Given a user's natural-language request (often Chinese, possibly long), extract 3-6 English keywords that capture:
|
||||
1. TASK VERB (optimize, review, debug, refactor, test, deploy, analyze, write, audit, design, research, cleanup, implement)
|
||||
2. OBJECT (code, prompt, test, UI, API, database, documentation, performance, security, architecture)
|
||||
3. CONTEXT/DOMAIN when clear (frontend, backend, mobile, python, go, rust, typescript)
|
||||
|
||||
Output ONLY space-separated lowercase English keywords. No prose, no JSON, no punctuation, no code fences.
|
||||
|
||||
Examples:
|
||||
- "帮我优化代码的性能" -> optimize code performance refactor
|
||||
- "研究当前代码的实现然后分析优化思路" -> analyze code research refactor architecture
|
||||
- "优化 prompt 的表达" -> optimize prompt refine writing
|
||||
- "帮我做 code review" -> code review audit
|
||||
- "清理代码里的 TODO" -> cleanup refactor dead-code
|
||||
- "重构这个模块的代码" -> refactor code modularize
|
||||
- "帮我写个 Go 单元测试" -> write test golang unit
|
||||
|
||||
Output ONLY keywords. Nothing else.`
|
||||
|
||||
const DEFAULT_TIMEOUT_MS = 6_000
|
||||
const MAX_QUERY_CHARS = 500
|
||||
const MAX_KEYWORDS_CHARS = 120
|
||||
|
||||
/** Process-level cache. Keyed by the original (trimmed) query. */
|
||||
const cache = new Map<string, string>()
|
||||
|
||||
export function isIntentNormalizeEnabled(): boolean {
|
||||
return process.env.SKILL_SEARCH_INTENT_ENABLED === '1'
|
||||
}
|
||||
|
||||
/** Only reset between tests. */
|
||||
export function clearIntentNormalizeCache(): void {
|
||||
cache.clear()
|
||||
}
|
||||
|
||||
/**
|
||||
* Normalize a user query so TF-IDF sees English task keywords.
|
||||
* Returns `<original> <keywords>` on success, or the original string on any
|
||||
* failure path. Never throws.
|
||||
*/
|
||||
export async function normalizeQueryIntent(query: string): Promise<string> {
|
||||
const trimmed = query.trim()
|
||||
if (!trimmed) return trimmed
|
||||
if (!isIntentNormalizeEnabled()) return trimmed
|
||||
|
||||
// ASCII-only queries are already in the right shape for the index.
|
||||
if (!/[\u4e00-\u9fff]/.test(trimmed)) return trimmed
|
||||
|
||||
const cached = cache.get(trimmed)
|
||||
if (cached !== undefined) return cached
|
||||
|
||||
const capped = trimmed.slice(0, MAX_QUERY_CHARS)
|
||||
const keywords = await callHaiku(capped)
|
||||
const result = keywords ? `${trimmed} ${keywords}` : trimmed
|
||||
cache.set(trimmed, result)
|
||||
logForDebugging(
|
||||
`[skill-search] intent normalized: "${trimmed.slice(0, 40)}" -> "${keywords}"`,
|
||||
)
|
||||
return result
|
||||
}
|
||||
|
||||
async function callHaiku(query: string): Promise<string> {
|
||||
const timeoutMs = getTimeoutMs()
|
||||
const controller = new AbortController()
|
||||
const timer = setTimeout(() => controller.abort(), timeoutMs)
|
||||
|
||||
try {
|
||||
const response = await queryHaiku({
|
||||
systemPrompt: asSystemPrompt([INTENT_SYSTEM_PROMPT]),
|
||||
userPrompt: query,
|
||||
signal: controller.signal,
|
||||
options: {
|
||||
querySource: 'skill_search_intent',
|
||||
enablePromptCaching: true,
|
||||
agents: [],
|
||||
isNonInteractiveSession: true,
|
||||
hasAppendSystemPrompt: false,
|
||||
mcpTools: [],
|
||||
},
|
||||
})
|
||||
const text = extractResponseText(response?.message?.content)
|
||||
return sanitizeKeywords(text)
|
||||
} catch (error) {
|
||||
logForDebugging(`[skill-search] intent normalize failed: ${error}`)
|
||||
return ''
|
||||
} finally {
|
||||
clearTimeout(timer)
|
||||
}
|
||||
}
|
||||
|
||||
function getTimeoutMs(): number {
|
||||
const raw = process.env.SKILL_SEARCH_INTENT_TIMEOUT_MS
|
||||
if (!raw) return DEFAULT_TIMEOUT_MS
|
||||
const parsed = Number(raw)
|
||||
if (!Number.isFinite(parsed) || parsed <= 0) return DEFAULT_TIMEOUT_MS
|
||||
return parsed
|
||||
}
|
||||
|
||||
function extractResponseText(content: unknown): string {
|
||||
if (!Array.isArray(content)) return ''
|
||||
const parts: string[] = []
|
||||
for (const block of content) {
|
||||
if (!block || typeof block !== 'object') continue
|
||||
const record = block as Record<string, unknown>
|
||||
if (record.type !== 'text') continue
|
||||
if (typeof record.text === 'string') parts.push(record.text)
|
||||
}
|
||||
return parts.join('').trim()
|
||||
}
|
||||
|
||||
function sanitizeKeywords(raw: string): string {
|
||||
if (!raw) return ''
|
||||
// Strip anything that's not a keyword character. Keep ascii letters, digits,
|
||||
// hyphens, and spaces. Collapse whitespace.
|
||||
const cleaned = raw
|
||||
.toLowerCase()
|
||||
.replace(/[^a-z0-9\- ]+/g, ' ')
|
||||
.replace(/\s+/g, ' ')
|
||||
.trim()
|
||||
if (!cleaned) return ''
|
||||
return cleaned.slice(0, MAX_KEYWORDS_CHARS)
|
||||
}
|
||||
Reference in New Issue
Block a user