refactor: 统一自建 Tool Search — 移除 tool_reference/defer_loading 依赖，全 provider 通用

- 重命名 ExecuteTool → ExecuteExtraTool，作为一等工具始终可用
- ToolSearchTool 输出改为纯文本（区分 core/deferred），移除 tool_reference blocks
- 移除 modelSupportsToolReference() 及相关 GrowthBook 配置
- 移除 API 侧 defer_loading 字段和 tool search beta header 注入
- 简化 system prompt（工具使用指南从 ~120 行压缩到 ~10 行）
- extractDiscoveredToolNames 支持文本格式解析（向后兼容旧 session 的 tool_reference）
- 更新 promptEngineeringAudit 测试以匹配简化后的 prompt 结构

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
claude-code-best
2026-05-09 14:19:31 +08:00
parent 4fc95bd5a7
commit 8c157f0767
17 changed files with 280 additions and 401 deletions

View File

@@ -238,30 +238,29 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
// TXT 来源: {request_evaluation_checklist} — Step 0→1→2→3
// ------------------------------------------------------------------
describe('#1 Decision tree for tool selection', () => {
test('prompt contains step-based tool selection guidance', async () => {
test('prompt contains tool selection guidance via dedicated tools', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Step 0')
expect(prompt).toContain('Step 1')
expect(prompt).toContain('Step 2')
expect(prompt).toContain('Step 3')
expect(prompt).toContain('Prefer dedicated tools')
expect(prompt).toContain('Reserve')
expect(prompt).toContain('shell operations')
})
test('decision tree has "stop at the first match" semantics', async () => {
test('guidance distinguishes dedicated tools from Bash', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('stop at the first match')
})
test('Step 0 teaches when NOT to use tools', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Step 0')
expect(prompt).toContain('answer directly, no tool call')
})
test('Step 1 prioritizes dedicated tools over Bash', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Step 1')
expect(prompt).toContain('dedicated tool')
})
test('lists core tools as directly callable', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Core tools')
expect(prompt).toContain('can be called directly')
})
test('provides concrete tool preference examples', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('over cat')
expect(prompt).toContain('over sed')
})
})
// ------------------------------------------------------------------
@@ -271,24 +270,26 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
describe('#2 Anti-pattern guidance (when NOT to use tools)', () => {
test('prompt says when NOT to use tools', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Do NOT use')
const hasAntiPattern =
prompt.includes('Do NOT use') ||
prompt.includes('Reserve') ||
prompt.includes('do not re-attempt')
expect(hasAntiPattern).toBe(true)
})
test('includes explicit "Do not use tools when" section', async () => {
test('guidance covers Bash misuse', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Do not use tools when')
const hasBashGuidance =
prompt.includes('Reserve') && prompt.includes('shell operations')
expect(hasBashGuidance).toBe(true)
})
test('anti-pattern covers knowledge questions', async () => {
test('anti-pattern covers file creation', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain(
'programming concepts, syntax, or design patterns',
)
})
test('anti-pattern covers content already in context', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('already visible in context')
const hasFileAntiPattern =
prompt.includes('Do not create files unless') ||
prompt.includes('prefer editing an existing file')
expect(hasFileAntiPattern).toBe(true)
})
test('includes file creation anti-pattern', async () => {
@@ -305,24 +306,25 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
// TXT 来源: {core_search_behaviors}, {past_chats_tools}
// ------------------------------------------------------------------
describe('#6 Progressive fallback chain', () => {
test('Grep/Glob fallback chain exists', async () => {
test('prompt encourages searching before asking user', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('fallback chain')
expect(prompt).toContain('search with')
})
test('fallback includes broader pattern as first retry', async () => {
test('search tools are available for discovery', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Broader pattern')
expect(prompt).toContain('Grep')
expect(prompt).toContain('Glob')
})
test('fallback includes alternate naming conventions', async () => {
test('fallback includes escalating to user via AskUserQuestion', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('camelCase vs snake_case')
expect(prompt).toContain('AskUserQuestion')
})
test('fallback ends with asking user after exhaustion', async () => {
test('search before saying unknown is present', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('ask for guidance')
expect(prompt).toContain('Search before saying unknown')
})
})
@@ -331,30 +333,33 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
// TXT 来源: {examples}, {visualizer_examples}, {past_chats_tools}
// ------------------------------------------------------------------
describe('#3 Few-shot examples', () => {
test('contains tool selection examples with arrow notation', async () => {
test('contains concrete tool preference examples', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('→')
expect(prompt).toContain('Tool selection examples')
})
test('has multiple concrete Request→Action pairs (>=5)', async () => {
const prompt = await getFullPrompt()
const arrowCount = (prompt.match(/[""].+?[""] → /g) || []).length
expect(arrowCount).toBeGreaterThanOrEqual(5)
const hasExamples =
prompt.includes('over cat') || prompt.includes('over sed')
expect(hasExamples).toBe(true)
})
test('examples cover different tool types', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Glob("**/*.tsx")')
expect(prompt).toContain('Bash("bun test")')
expect(prompt).toContain('Grep("TODO")')
expect(prompt).toContain('answer directly')
expect(prompt).toContain('Read')
expect(prompt).toContain('Edit')
expect(prompt).toContain('Grep')
})
test('examples include negative cases (what NOT to use)', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('not Bash find')
expect(prompt).toContain('not Bash sed')
const hasNegative =
prompt.includes('over cat') ||
prompt.includes('over sed') ||
prompt.includes('over find') ||
prompt.includes('over grep')
expect(hasNegative).toBe(true)
})
test('core tools are enumerated', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Core tools')
})
})
@@ -392,16 +397,18 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
expect(prompt).toContain('cost of pausing to confirm is low')
})
test('frames search tools as cheap', async () => {
test('guidance encourages searching over guessing', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('cheap operations')
const hasSearchGuidance =
prompt.includes('Search before saying unknown') ||
prompt.includes('search with')
expect(hasSearchGuidance).toBe(true)
})
test('expanded cost asymmetry with multiple scenarios', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Cost asymmetry principle')
expect(prompt).toContain('costs user trust')
expect(prompt).toContain('breaks their flow')
// Simplified prompt conveys cost via "search before saying unknown"
expect(prompt).toContain('search with')
})
})
@@ -432,32 +439,24 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
// TXT 来源: {search_usage_guidelines}, {past_chats_tools}
// ------------------------------------------------------------------
describe('#8 Query construction guidance', () => {
test('includes Grep query construction advice', async () => {
test('Grep is mentioned as a search tool', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('query construction')
expect(prompt).toContain('content words')
expect(prompt).toContain('Grep')
})
test('Grep guidance teaches content words vs meta-descriptions', async () => {
test('Glob is mentioned as a search tool', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('authenticate|login|signIn')
expect(prompt).toContain('not "auth handling code"')
expect(prompt).toContain('Glob')
})
test('Grep guidance teaches pipe alternation for naming variants', async () => {
test('search tools are referenced in "Search before saying unknown"', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('userId|user_id|userID')
expect(prompt).toContain('Search before saying unknown')
})
test('includes Glob query construction advice', async () => {
test('dedicated tools are preferred over Bash equivalents', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Glob query construction')
expect(prompt).toContain('**/*Auth*.ts')
})
test('Glob guidance teaches narrowing by extension', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('**/*.test.ts')
expect(prompt).toContain('Prefer dedicated tools')
})
})
@@ -491,16 +490,15 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
// TXT 来源: {tool_discovery}, {core_search_behaviors}
// ------------------------------------------------------------------
describe('#10 Multi-step search strategy', () => {
test('scales search effort to task complexity', async () => {
test('encourages searching before concluding', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Scale search effort to task complexity')
expect(prompt).toContain('Search before saying unknown')
})
test('gives concrete complexity tiers', async () => {
test('provides multiple search tools for different scopes', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Single file fix')
expect(prompt).toContain('Cross-cutting change')
expect(prompt).toContain('Architecture investigation')
expect(prompt).toContain('Grep')
expect(prompt).toContain('Glob')
})
})
@@ -530,12 +528,12 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
describe('#22 Search before saying unknown', () => {
test('instructs to search before claiming something does not exist', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('Search first, report results second')
expect(prompt).toContain('Search before saying unknown')
})
test('explicitly says do not say "I don\'t see that file"', async () => {
test('core tools are listed as always available', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain("don't see that file")
expect(prompt).toContain('call them directly')
})
})
@@ -663,9 +661,9 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
test('tool_discovery: search before saying unavailable', async () => {
const prompt = await getFullPrompt()
expect(prompt).toContain('visible tool list is partial by design')
expect(prompt).toContain('search for it')
expect(prompt).toContain(
'Only state something is unavailable after the search returns no match',
'Only state something is unavailable after ToolSearch returns no match',
)
})