refactor: 统一自建 Tool Search — 移除 tool_reference/defer_loading 依赖，全 provider 通用

- 重命名 ExecuteTool → ExecuteExtraTool，作为一等工具始终可用
- ToolSearchTool 输出改为纯文本（区分 core/deferred），移除 tool_reference blocks
- 移除 modelSupportsToolReference() 及相关 GrowthBook 配置
- 移除 API 侧 defer_loading 字段和 tool search beta header 注入
- 简化 system prompt（工具使用指南从 ~120 行压缩到 ~10 行）
- extractDiscoveredToolNames 支持文本格式解析（向后兼容旧 session 的 tool_reference）
- 更新 promptEngineeringAudit 测试以匹配简化后的 prompt 结构

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-22 16:25:51 +00:00 · 2026-05-09 14:19:31 +08:00
parent 4fc95bd5a7
commit 8c157f0767
17 changed files with 280 additions and 401 deletions
--- a/src/constants/promptEngineeringAudit.runner.ts
+++ b/src/constants/promptEngineeringAudit.runner.ts
@@ -238,30 +238,29 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
  // TXT 来源: {request_evaluation_checklist} — Step 0→1→2→3
  // ------------------------------------------------------------------
  describe('#1 Decision tree for tool selection', () => {
-    test('prompt contains step-based tool selection guidance', async () => {
+    test('prompt contains tool selection guidance via dedicated tools', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Step 0')
-      expect(prompt).toContain('Step 1')
-      expect(prompt).toContain('Step 2')
-      expect(prompt).toContain('Step 3')
+      expect(prompt).toContain('Prefer dedicated tools')
+      expect(prompt).toContain('Reserve')
+      expect(prompt).toContain('shell operations')
    })

-    test('decision tree has "stop at the first match" semantics', async () => {
+    test('guidance distinguishes dedicated tools from Bash', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('stop at the first match')
-    })
-
-    test('Step 0 teaches when NOT to use tools', async () => {
-      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Step 0')
-      expect(prompt).toContain('answer directly, no tool call')
-    })
-
-    test('Step 1 prioritizes dedicated tools over Bash', async () => {
-      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Step 1')
      expect(prompt).toContain('dedicated tool')
    })
+
+    test('lists core tools as directly callable', async () => {
+      const prompt = await getFullPrompt()
+      expect(prompt).toContain('Core tools')
+      expect(prompt).toContain('can be called directly')
+    })
+
+    test('provides concrete tool preference examples', async () => {
+      const prompt = await getFullPrompt()
+      expect(prompt).toContain('over cat')
+      expect(prompt).toContain('over sed')
+    })
  })

  // ------------------------------------------------------------------
@@ -271,24 +270,26 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
  describe('#2 Anti-pattern guidance (when NOT to use tools)', () => {
    test('prompt says when NOT to use tools', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Do NOT use')
+      const hasAntiPattern =
+        prompt.includes('Do NOT use') ||
+        prompt.includes('Reserve') ||
+        prompt.includes('do not re-attempt')
+      expect(hasAntiPattern).toBe(true)
    })

-    test('includes explicit "Do not use tools when" section', async () => {
+    test('guidance covers Bash misuse', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Do not use tools when')
+      const hasBashGuidance =
+        prompt.includes('Reserve') && prompt.includes('shell operations')
+      expect(hasBashGuidance).toBe(true)
    })

-    test('anti-pattern covers knowledge questions', async () => {
+    test('anti-pattern covers file creation', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain(
-        'programming concepts, syntax, or design patterns',
-      )
-    })
-
-    test('anti-pattern covers content already in context', async () => {
-      const prompt = await getFullPrompt()
-      expect(prompt).toContain('already visible in context')
+      const hasFileAntiPattern =
+        prompt.includes('Do not create files unless') ||
+        prompt.includes('prefer editing an existing file')
+      expect(hasFileAntiPattern).toBe(true)
    })

    test('includes file creation anti-pattern', async () => {
@@ -305,24 +306,25 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
  // TXT 来源: {core_search_behaviors}, {past_chats_tools}
  // ------------------------------------------------------------------
  describe('#6 Progressive fallback chain', () => {
-    test('Grep/Glob fallback chain exists', async () => {
+    test('prompt encourages searching before asking user', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('fallback chain')
+      expect(prompt).toContain('search with')
    })

-    test('fallback includes broader pattern as first retry', async () => {
+    test('search tools are available for discovery', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Broader pattern')
+      expect(prompt).toContain('Grep')
+      expect(prompt).toContain('Glob')
    })

-    test('fallback includes alternate naming conventions', async () => {
+    test('fallback includes escalating to user via AskUserQuestion', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('camelCase vs snake_case')
+      expect(prompt).toContain('AskUserQuestion')
    })

-    test('fallback ends with asking user after exhaustion', async () => {
+    test('search before saying unknown is present', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('ask for guidance')
+      expect(prompt).toContain('Search before saying unknown')
    })
  })

@@ -331,30 +333,33 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
  // TXT 来源: {examples}, {visualizer_examples}, {past_chats_tools}
  // ------------------------------------------------------------------
  describe('#3 Few-shot examples', () => {
-    test('contains tool selection examples with arrow notation', async () => {
+    test('contains concrete tool preference examples', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('→')
-      expect(prompt).toContain('Tool selection examples')
-    })
-
-    test('has multiple concrete Request→Action pairs (>=5)', async () => {
-      const prompt = await getFullPrompt()
-      const arrowCount = (prompt.match(/[""].+?[""] → /g) || []).length
-      expect(arrowCount).toBeGreaterThanOrEqual(5)
+      const hasExamples =
+        prompt.includes('over cat') || prompt.includes('over sed')
+      expect(hasExamples).toBe(true)
    })

    test('examples cover different tool types', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Glob("**/*.tsx")')
-      expect(prompt).toContain('Bash("bun test")')
-      expect(prompt).toContain('Grep("TODO")')
-      expect(prompt).toContain('answer directly')
+      expect(prompt).toContain('Read')
+      expect(prompt).toContain('Edit')
+      expect(prompt).toContain('Grep')
    })

    test('examples include negative cases (what NOT to use)', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('not Bash find')
-      expect(prompt).toContain('not Bash sed')
+      const hasNegative =
+        prompt.includes('over cat') ||
+        prompt.includes('over sed') ||
+        prompt.includes('over find') ||
+        prompt.includes('over grep')
+      expect(hasNegative).toBe(true)
+    })
+
+    test('core tools are enumerated', async () => {
+      const prompt = await getFullPrompt()
+      expect(prompt).toContain('Core tools')
    })
  })

@@ -392,16 +397,18 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
      expect(prompt).toContain('cost of pausing to confirm is low')
    })

-    test('frames search tools as cheap', async () => {
+    test('guidance encourages searching over guessing', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('cheap operations')
+      const hasSearchGuidance =
+        prompt.includes('Search before saying unknown') ||
+        prompt.includes('search with')
+      expect(hasSearchGuidance).toBe(true)
    })

    test('expanded cost asymmetry with multiple scenarios', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Cost asymmetry principle')
-      expect(prompt).toContain('costs user trust')
-      expect(prompt).toContain('breaks their flow')
+      // Simplified prompt conveys cost via "search before saying unknown"
+      expect(prompt).toContain('search with')
    })
  })

@@ -432,32 +439,24 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
  // TXT 来源: {search_usage_guidelines}, {past_chats_tools}
  // ------------------------------------------------------------------
  describe('#8 Query construction guidance', () => {
-    test('includes Grep query construction advice', async () => {
+    test('Grep is mentioned as a search tool', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('query construction')
-      expect(prompt).toContain('content words')
+      expect(prompt).toContain('Grep')
    })

-    test('Grep guidance teaches content words vs meta-descriptions', async () => {
+    test('Glob is mentioned as a search tool', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('authenticate|login|signIn')
-      expect(prompt).toContain('not "auth handling code"')
+      expect(prompt).toContain('Glob')
    })

-    test('Grep guidance teaches pipe alternation for naming variants', async () => {
+    test('search tools are referenced in "Search before saying unknown"', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('userId|user_id|userID')
+      expect(prompt).toContain('Search before saying unknown')
    })

-    test('includes Glob query construction advice', async () => {
+    test('dedicated tools are preferred over Bash equivalents', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Glob query construction')
-      expect(prompt).toContain('**/*Auth*.ts')
-    })
-
-    test('Glob guidance teaches narrowing by extension', async () => {
-      const prompt = await getFullPrompt()
-      expect(prompt).toContain('**/*.test.ts')
+      expect(prompt).toContain('Prefer dedicated tools')
    })
  })

@@ -491,16 +490,15 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
  // TXT 来源: {tool_discovery}, {core_search_behaviors}
  // ------------------------------------------------------------------
  describe('#10 Multi-step search strategy', () => {
-    test('scales search effort to task complexity', async () => {
+    test('encourages searching before concluding', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Scale search effort to task complexity')
+      expect(prompt).toContain('Search before saying unknown')
    })

-    test('gives concrete complexity tiers', async () => {
+    test('provides multiple search tools for different scopes', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Single file fix')
-      expect(prompt).toContain('Cross-cutting change')
-      expect(prompt).toContain('Architecture investigation')
+      expect(prompt).toContain('Grep')
+      expect(prompt).toContain('Glob')
    })
  })

@@ -530,12 +528,12 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {
  describe('#22 Search before saying unknown', () => {
    test('instructs to search before claiming something does not exist', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('Search first, report results second')
+      expect(prompt).toContain('Search before saying unknown')
    })

-    test('explicitly says do not say "I don\'t see that file"', async () => {
+    test('core tools are listed as always available', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain("don't see that file")
+      expect(prompt).toContain('call them directly')
    })
  })

@@ -663,9 +661,9 @@ describe('Opus 4.7 Prompt Engineering Audit', () => {

    test('tool_discovery: search before saying unavailable', async () => {
      const prompt = await getFullPrompt()
-      expect(prompt).toContain('visible tool list is partial by design')
+      expect(prompt).toContain('search for it')
      expect(prompt).toContain(
-        'Only state something is unavailable after the search returns no match',
+        'Only state something is unavailable after ToolSearch returns no match',
      )
    })