Files
claude-code/src/services/skillSearch/localSearch.ts
claude-code-best fcbc882232 chore: 清理 src 下 113 项未使用导入和死代码
删除未使用的文件(BuiltinStatusLine.tsx、4 个重复的 .ts stub)、
移除约 55 个文件中未使用的 React 导入、
清理约 50 处未使用的导入/变量/参数。
净减少 ~296 行代码,precheck 4077 测试全部通过。

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-05 20:05:15 +08:00

444 lines
10 KiB
TypeScript

import { logForDebugging } from '../../utils/debug.js'
/**
 * One searchable skill in the index built by `getSkillIndex`.
 */
export interface SkillIndexEntry {
/** Skill name, taken from the command's `name`. */
name: string
/** `name` lower-cased with every `-` and `_` replaced by a space. */
normalizedName: string
/** The command's description, or the empty string when it has none. */
description: string
/** The command's `whenToUse` value, copied through as-is. */
whenToUse: string | undefined
/** The command's `source`, or `'unknown'` when the command does not provide one. */
source: string
/** Copied from the command's `loadedFrom` property; not interpreted in this file. */
loadedFrom: string | undefined
/** Copied from the command's `skillRoot` property; not interpreted in this file. */
skillRoot: string | undefined
/** Copied from the command's `contentLength` property; not interpreted in this file. */
contentLength: number | undefined
/**
 * De-duplicated stemmed tokens from the name, description, whenToUse and
 * allowedTools fields. Used to count document frequency per term.
 */
tokens: string[]
/**
 * Term -> weight. Starts as the field-weighted term frequency and is
 * multiplied in place by the term's IDF once the whole index has been built.
 */
tfVector: Map<string, number>
}
/**
 * A single hit returned by `searchSkills`.
 */
export interface SearchResult {
/** Name of the matching skill. */
name: string
/** Description of the matching skill. */
description: string
/**
 * Relevance score: the cosine similarity between query and skill vectors,
 * raised to at least 0.75 when the normalized query contains the skill's
 * normalized name.
 */
score: number
/** Never populated by `searchSkills` in this file. NOTE(review): presumably filled in by callers; confirm. */
shortId?: string
/** Copied from the index entry. */
source?: string
/** Copied from the index entry. */
loadedFrom?: string
/** Copied from the index entry. */
skillRoot?: string
/** Copied from the index entry. */
contentLength?: number
}
/**
 * Lower-case English function words that `tokenize` drops before stemming.
 *
 * Each string below is a space-separated group; the groups exist only for
 * readability and are flattened into a single set.
 */
const STOP_WORDS = new Set<string>(
  [
    // articles
    'a an the',
    // forms of be / have / do
    'is are was were be been being',
    'have has had',
    'do does did',
    // modal and semi-modal verbs
    'will would could should may might shall can need dare ought used',
    // prepositions and particles
    'to of in for on with at by from as into through during',
    'before after above below between out off over under',
    // adverbs and question words
    'again further then once here there when where why how',
    // quantifiers, negation and degree words
    'all each every both few more most other some such',
    'no nor not only own same so than too very just',
    // conjunctions
    'because but and or if while',
    // demonstratives and pronouns
    'this that these those',
    'it its i me my we our you your he him his she her they them their',
    'what which who whom',
    // generic "use" verbs ('used' is already listed with the modals)
    'use using',
  ].flatMap(group => group.split(' ')),
)
/**
 * Matches a UTF-16 code unit in U+4E00–U+9FFF or U+3400–U+4DBF.
 * The pattern is unanchored and has no `g` flag, so it carries no state.
 */
const CJK_RANGE = /[\u4e00-\u9fff\u3400-\u4dbf]/

/**
 * Reports whether `ch` contains at least one character matched by CJK_RANGE.
 * Callers in this file pass a single character (or the empty string).
 */
function isCjk(ch: string): boolean {
  return ch.search(CJK_RANGE) >= 0
}
/**
 * Splits `text` into lower-case search tokens.
 *
 * - A run of consecutive CJK characters is emitted as overlapping
 *   two-character bigrams; a run of length one therefore yields no token.
 * - A run starting with `[a-z0-9]` and continuing over `[a-z0-9-_]` becomes one
 *   word token, with leading/trailing `-`/`_` trimmed. Words found in
 *   STOP_WORDS are discarded.
 * - Every other character is skipped.
 *
 * @param text Arbitrary input text.
 * @returns Tokens in the order they appear in `text`.
 */
export function tokenize(text: string): string[] {
  const wordStart = /[a-z0-9]/
  const wordChar = /[a-z0-9\-_]/
  const source = text.toLowerCase()
  const out: string[] = []

  let pos = 0
  while (pos < source.length) {
    const ch = source.charAt(pos)

    if (isCjk(ch)) {
      // Consume the whole CJK run, then emit each adjacent pair.
      const runStart = pos
      do {
        pos += 1
      } while (pos < source.length && isCjk(source.charAt(pos)))
      for (let k = runStart; k + 1 < pos; k += 1) {
        out.push(source.slice(k, k + 2))
      }
      continue
    }

    if (!wordStart.test(ch)) {
      pos += 1
      continue
    }

    // Consume an ASCII word; hyphens and underscores may appear inside it.
    const wordBegin = pos
    do {
      pos += 1
    } while (pos < source.length && wordChar.test(source.charAt(pos)))
    const trimmed = source.slice(wordBegin, pos).replace(/^[-_]+|[-_]+$/g, '')
    if (trimmed !== '' && !STOP_WORDS.has(trimmed)) {
      out.push(trimmed)
    }
  }

  return out
}
/**
 * Suffix-stripping rules for `stem`, tried in order: `[suffix, minLength, drop]`.
 * A rule applies when the word ends with `suffix` and is longer than
 * `minLength`; `drop` characters are then removed from the end.
 */
const STEM_SUFFIX_RULES: ReadonlyArray<readonly [string, number, number]> = [
  ['ing', 5, 3],
  ['tion', 5, 4],
  ['ness', 5, 4],
  ['ment', 5, 4],
  ['ers', 4, 1], // only the plural "s" is removed
  ['er', 4, 2],
  ['es', 4, 2],
  ['s', 3, 1], // skipped for words ending in "ss" (see below)
  ['ed', 4, 2],
  ['ly', 4, 2],
]

/**
 * Reduces an English token to a crude stem by stripping at most one suffix.
 * Tokens whose first character is CJK are returned unchanged.
 *
 * The first applicable rule wins and no further rules are tried.
 *
 * NOTE(review): singular and plural forms do not always meet — e.g. "files"
 * becomes "fil" while "file" is unchanged, and "helpers" becomes "helper"
 * while "helper" becomes "help".
 *
 * @param word A lower-case token.
 * @returns The stemmed token.
 */
function stem(word: string): string {
  if (isCjk(word[0] ?? '')) return word

  for (const [suffix, minLength, drop] of STEM_SUFFIX_RULES) {
    if (!word.endsWith(suffix) || word.length <= minLength) continue
    // The bare "s" rule must not turn "class" into "clas".
    if (suffix === 's' && word.endsWith('ss')) continue
    return word.slice(0, -drop)
  }
  return word
}
/**
 * Tokenizes `text` and stems every resulting token.
 *
 * @param text Arbitrary input text.
 * @returns One stem per token produced by `tokenize`, in the same order.
 */
export function tokenizeAndStem(text: string): string[] {
  const stems: string[] = []
  for (const token of tokenize(text)) {
    stems.push(stem(token))
  }
  return stems
}
/**
 * Per-field multipliers applied to normalized term frequency when
 * `getSkillIndex` builds a skill's vector. A term that occurs in several
 * fields keeps only its largest weighted value, so a higher weight lets a
 * field dominate.
 */
const FIELD_WEIGHT = {
// Tokens from the skill name, including its hyphen/underscore-separated parts.
name: 3.0,
// Tokens from the skill's `whenToUse` text.
whenToUse: 2.0,
// Tokens from the skill's description.
description: 1.0,
// Tokens from the space-joined `allowedTools` list.
allowedTools: 0.3,
} as const
/**
 * Builds a term -> weight map from several token lists.
 *
 * Within each field a term's frequency is divided by the frequency of that
 * field's most common term and then multiplied by the field's weight. When a
 * term occurs in more than one field, the largest value is kept. Values that
 * are not greater than zero are never stored.
 *
 * @param fields Token lists paired with the weight of the field they came from.
 * @returns Map from term to its best weighted term frequency.
 */
function computeWeightedTf(
  fields: { tokens: string[]; weight: number }[],
): Map<string, number> {
  const best = new Map<string, number>()

  fields.forEach(({ tokens, weight }) => {
    const counts = new Map<string, number>()
    tokens.forEach(token => {
      counts.set(token, (counts.get(token) ?? 0) + 1)
    })

    // Starts at 1 so an empty field never causes a division by zero.
    let peak = 1
    counts.forEach(n => {
      peak = Math.max(peak, n)
    })

    counts.forEach((n, term) => {
      const candidate = (n / peak) * weight
      if (candidate > (best.get(term) ?? 0)) {
        best.set(term, candidate)
      }
    })
  })

  return best
}
/**
 * Computes inverse document frequency for every term in the index as
 * `ln(N / df)`, where `N` is the number of entries and `df` is the number of
 * entries whose `tokens` contain the term.
 *
 * NOTE(review): a term present in every entry gets an IDF of exactly 0, so
 * with a single-entry index every term weighs nothing.
 *
 * @param index Index entries; only their `tokens` are read.
 * @returns Map from term to IDF.
 */
function computeIdf(index: SkillIndexEntry[]): Map<string, number> {
  const docFreq = new Map<string, number>()
  for (const { tokens } of index) {
    // A Set counts each term once per entry even if `tokens` has repeats.
    for (const term of new Set(tokens)) {
      docFreq.set(term, (docFreq.get(term) ?? 0) + 1)
    }
  }

  const total = index.length
  const result = new Map<string, number>()
  docFreq.forEach((containing, term) => {
    result.set(term, Math.log(total / containing))
  })
  return result
}
/**
 * Cosine similarity between two sparse vectors stored as term -> weight maps.
 *
 * @param queryTfIdf Query vector.
 * @param docTfIdf Document vector.
 * @returns `dot / (|query| * |doc|)`, or 0 when either vector has zero length.
 */
function cosineSimilarity(
  queryTfIdf: Map<string, number>,
  docTfIdf: Map<string, number>,
): number {
  let dot = 0
  let querySquared = 0
  queryTfIdf.forEach((queryWeight, term) => {
    // Terms missing from the document contribute nothing to the dot product.
    dot += queryWeight * (docTfIdf.get(term) ?? 0)
    querySquared += queryWeight * queryWeight
  })

  let docSquared = 0
  docTfIdf.forEach(docWeight => {
    docSquared += docWeight * docWeight
  })

  const magnitude = Math.sqrt(querySquared) * Math.sqrt(docSquared)
  if (magnitude === 0) return 0
  return dot / magnitude
}
/** Score threshold used when SKILL_SEARCH_DISPLAY_MIN_SCORE is unset or unusable. */
const DEFAULT_DISPLAY_MIN_SCORE = 0.1

/**
 * Parses the raw SKILL_SEARCH_DISPLAY_MIN_SCORE value.
 *
 * Falls back to DEFAULT_DISPLAY_MIN_SCORE when the value is missing, blank or
 * not a finite number. Without the fallback a typo would yield NaN, and since
 * `score >= NaN` is always false every search would return nothing.
 *
 * @param raw The environment variable's value, if any.
 * @returns A finite threshold.
 */
function parseDisplayMinScore(raw: string | undefined): number {
  if (raw === undefined || raw.trim() === '') return DEFAULT_DISPLAY_MIN_SCORE
  const parsed = Number(raw)
  return Number.isFinite(parsed) ? parsed : DEFAULT_DISPLAY_MIN_SCORE
}

/** Minimum score a skill needs to be included in search results. */
const DISPLAY_MIN_SCORE = parseDisplayMinScore(
  process.env.SKILL_SEARCH_DISPLAY_MIN_SCORE,
)
/** Skill names shorter than this are not eligible for the name-match boost. */
const NAME_MATCH_MIN_LENGTH = 4
/**
 * Number of query CJK bigrams that must be present in a skill's vector for its
 * score to stand without also needing a non-CJK term match.
 */
const CJK_MIN_BIGRAM_MATCHES = 2
/**
 * Lower-cases `name` and turns every `-` or `_` into a single space, so that
 * "my-skill_name" and "My Skill Name" compare equal.
 *
 * @param name Skill name or free text.
 * @returns The normalized string; its length equals that of the lower-cased input.
 */
function normalizeSkillName(name: string): string {
  const lowered = name.toLowerCase()
  return lowered.split(/[-_]/).join(' ')
}
/**
 * Splits a skill name on `-` and `_` and keeps the lower-cased parts that are
 * at least three characters long.
 *
 * @param name Skill name.
 * @returns The qualifying parts, in their original order.
 */
function splitHyphenatedName(name: string): string[] {
  const kept: string[] = []
  for (const part of name.toLowerCase().split(/[-_]/)) {
    if (part.length < 3) continue
    kept.push(part)
  }
  return kept
}
// Module-level cache holding the most recently built index.
/** Entries from the last `getSkillIndex` build, or null when nothing is cached. */
let cachedIndex: SkillIndexEntry[] | null = null
/** IDF table computed alongside `cachedIndex`. */
let cachedIdf: Map<string, number> | null = null
/** The `cwd` argument `cachedIndex` was built for. */
let cachedCwd: string | null = null

/**
 * Discards the cached index, its IDF table and the remembered cwd, so the next
 * `getSkillIndex` call rebuilds the index. Emits a debug log line.
 */
export function clearSkillIndexCache(): void {
  cachedIndex = cachedIdf = cachedCwd = null
  logForDebugging('[skill-search] index cache cleared')
}
/**
 * Returns the searchable skill index for `cwd`, building and caching it on the
 * first call (or whenever `cwd` differs from the cached one).
 *
 * Only commands whose `type` is `'prompt'` and whose `disableModelInvocation`
 * is falsy are indexed. For each one, tokens are extracted from the name
 * (whole and split on `-`/`_`), description, `whenToUse` and `allowedTools`;
 * a field-weighted TF vector is built; and, once every entry exists, each
 * vector is scaled in place by the corpus IDF.
 *
 * @param cwd Directory passed through to `getCommands`.
 * @returns The index entries (the same array instance on cache hits).
 */
export async function getSkillIndex(cwd: string): Promise<SkillIndexEntry[]> {
  if (cachedIndex && cachedCwd === cwd) return cachedIndex

  // Loaded lazily. NOTE(review): presumably to avoid an import cycle with
  // commands.js — confirm.
  const { getCommands } = await import('../../commands.js')
  const commands = await getCommands(cwd)

  const entries: SkillIndexEntry[] = []
  for (const cmd of commands) {
    // Loosely-typed view used for properties read via casts below.
    const props = cmd as Record<string, unknown>
    if (props.type !== 'prompt' || props.disableModelInvocation) continue

    const name = cmd.name
    const description = cmd.description ?? ''
    const whenToUse = props.whenToUse as string | undefined
    const toolList = props.allowedTools as string[] | undefined
    const allowedTools = toolList?.join(' ') ?? ''

    // The name contributes both as a whole token and as its separate parts,
    // so "code-review" can be found by "review" alone.
    const nameTokens = [
      ...tokenizeAndStem(name),
      ...splitHyphenatedName(name)
        .map(stem)
        .filter(part => !STOP_WORDS.has(part)),
    ]
    const descTokens = tokenizeAndStem(description)
    const whenTokens = tokenizeAndStem(whenToUse ?? '')
    const toolTokens = tokenizeAndStem(allowedTools)

    entries.push({
      name,
      normalizedName: normalizeSkillName(name),
      description,
      whenToUse,
      source: (props.source as string) ?? 'unknown',
      loadedFrom: props.loadedFrom as string | undefined,
      skillRoot: props.skillRoot as string | undefined,
      contentLength: props.contentLength as number | undefined,
      tokens: [
        ...new Set([
          ...nameTokens,
          ...descTokens,
          ...whenTokens,
          ...toolTokens,
        ]),
      ],
      tfVector: computeWeightedTf([
        { tokens: nameTokens, weight: FIELD_WEIGHT.name },
        { tokens: whenTokens, weight: FIELD_WEIGHT.whenToUse },
        { tokens: descTokens, weight: FIELD_WEIGHT.description },
        { tokens: toolTokens, weight: FIELD_WEIGHT.allowedTools },
      ]),
    })
  }

  // IDF needs the full corpus, so TF vectors are scaled only after every
  // entry has been collected.
  const idf = computeIdf(entries)
  for (const { tfVector } of entries) {
    tfVector.forEach((tf, term) => {
      tfVector.set(term, tf * (idf.get(term) ?? 0))
    })
  }

  cachedIndex = entries
  cachedIdf = idf
  cachedCwd = cwd
  logForDebugging(
    `[skill-search] indexed ${entries.length} skills from ${commands.length} commands`,
  )
  return entries
}
/**
 * Ranks the skills in `index` against a free-text `query`.
 *
 * Scoring, per entry:
 * 1. Cosine similarity between the query's TF-IDF vector and the entry's
 *    vector.
 * 2. If the query contains CJK bigrams and fewer than CJK_MIN_BIGRAM_MATCHES of
 *    them occur in the entry, the score is zeroed unless a non-CJK query term
 *    also occurs in the entry.
 * 3. If the entry's name is at least NAME_MATCH_MIN_LENGTH characters and its
 *    normalized form is a substring of the normalized query, the score is
 *    raised to at least 0.75.
 * Entries scoring below DISPLAY_MIN_SCORE are dropped.
 *
 * NOTE(review): the query IDF comes from the module-level `cachedIdf` whenever
 * one exists, even if `index` is not the cached index — confirm that callers
 * only pass indexes derived from `getSkillIndex`.
 * NOTE(review): because of rule 2, a query made of a single CJK bigram and no
 * matching non-CJK term can only ever match through rule 3.
 *
 * @param query Free-text search query.
 * @param index Entries to search.
 * @param limit Maximum number of results; values that are not greater than
 *   zero yield an empty list.
 * @returns Up to `limit` results, highest score first.
 */
export function searchSkills(
  query: string,
  index: SkillIndexEntry[],
  limit = 5,
): SearchResult[] {
  // Clamp first: a negative limit passed to `slice(0, limit)` would otherwise
  // return all but the last |limit| results instead of none.
  const maxResults = Math.max(0, limit)
  if (!(maxResults > 0)) return []
  if (index.length === 0 || !query.trim()) return []

  const queryTokens = tokenizeAndStem(query)
  if (queryTokens.length === 0) return []

  // Query term frequency, normalized by the most frequent query term.
  const freq = new Map<string, number>()
  for (const t of queryTokens) freq.set(t, (freq.get(t) ?? 0) + 1)
  let max = 1
  for (const v of freq.values()) if (v > max) max = v

  const idf = cachedIdf ?? computeIdf(index)
  const queryTfIdf = new Map<string, number>()
  for (const [term, count] of freq) {
    // Terms unknown to the corpus get weight 0.
    queryTfIdf.set(term, (count / max) * (idf.get(term) ?? 0))
  }

  const queryCjkTokens = queryTokens.filter(t => isCjk(t[0] ?? ''))
  const queryAsciiTokens = queryTokens.filter(t => !isCjk(t[0] ?? ''))
  // Same normalization as SkillIndexEntry.normalizedName, so the two can be
  // compared by substring.
  const queryLower = normalizeSkillName(query)

  const results: SearchResult[] = []
  for (const entry of index) {
    let score = cosineSimilarity(queryTfIdf, entry.tfVector)

    if (queryCjkTokens.length > 0 && score > 0) {
      const matchingCjk = queryCjkTokens.filter(t => entry.tfVector.has(t))
      if (matchingCjk.length < CJK_MIN_BIGRAM_MATCHES) {
        // Too few bigram hits to trust on their own; keep the score only if a
        // non-CJK query term backs it up.
        const hasAsciiMatch = queryAsciiTokens.some(t => entry.tfVector.has(t))
        if (!hasAsciiMatch) score = 0
      }
    }

    if (
      entry.name.length >= NAME_MATCH_MIN_LENGTH &&
      queryLower.includes(entry.normalizedName)
    ) {
      score = Math.max(score, 0.75)
    }

    if (score >= DISPLAY_MIN_SCORE) {
      results.push({
        name: entry.name,
        description: entry.description,
        score,
        source: entry.source,
        loadedFrom: entry.loadedFrom,
        skillRoot: entry.skillRoot,
        contentLength: entry.contentLength,
      })
    }
  }

  results.sort((a, b) => b.score - a.score)
  return results.slice(0, maxResults)
}