feat: 修正 web search 工具

This commit is contained in:
claude-code-best
2026-04-03 00:47:37 +08:00
parent d04e00fc2c
commit e48da3956c
13 changed files with 1241 additions and 270 deletions

View File

@@ -384,7 +384,7 @@ export async function getURLMarkdownContent(
// This is for enterprise customers with restrictive security policies
// that prevent outbound connections to claude.ai
const settings = getSettings_DEPRECATED()
if (!settings.skipWebFetchPreflight) {
if (settings.skipWebFetchPreflight === false) {
const checkResult = await checkDomainBlocklist(hostname)
switch (checkResult.status) {
case 'allowed':

View File

@@ -1,19 +1,9 @@
import type {
BetaContentBlock,
BetaWebSearchTool20250305,
} from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
import { getAPIProvider } from 'src/utils/model/providers.js'
import type { PermissionResult } from 'src/utils/permissions/PermissionResult.js'
import { z } from 'zod/v4'
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
import { queryModelWithStreaming } from '../../services/api/claude.js'
import { buildTool, type ToolDef } from '../../Tool.js'
import { lazySchema } from '../../utils/lazySchema.js'
import { logError } from '../../utils/log.js'
import { createUserMessage } from '../../utils/messages.js'
import { getMainLoopModel, getSmallFastModel } from '../../utils/model/model.js'
import { jsonParse, jsonStringify } from '../../utils/slowOperations.js'
import { asSystemPrompt } from '../../utils/systemPromptType.js'
import { jsonStringify } from '../../utils/slowOperations.js'
import { createAdapter } from './adapters/index.js'
import { getWebSearchPrompt, WEB_SEARCH_TOOL_NAME } from './prompt.js'
import {
getToolUseSummary,
@@ -37,12 +27,11 @@ const inputSchema = lazySchema(() =>
)
type InputSchema = ReturnType<typeof inputSchema>
type Input = z.infer<InputSchema>
const searchResultSchema = lazySchema(() => {
const searchHitSchema = z.object({
title: z.string().describe('The title of the search result'),
url: z.string().describe('The URL of the search result'),
snippet: z.string().optional().describe('A short description of the search result'),
})
return z.object({
@@ -73,82 +62,6 @@ export type { WebSearchProgress } from '../../types/tools.js'
import type { WebSearchProgress } from '../../types/tools.js'
/**
 * Builds the server-side `web_search_20250305` tool definition for one call.
 *
 * The domain filters are copied straight from the tool input; the number of
 * searches the model may perform is capped at 8.
 *
 * @param input - Validated tool input carrying the optional domain filters.
 * @returns The tool schema to pass along with the model request.
 */
function makeToolSchema(input: Input): BetaWebSearchTool20250305 {
  const { allowed_domains, blocked_domains } = input
  const schema: BetaWebSearchTool20250305 = {
    type: 'web_search_20250305',
    name: 'web_search',
    allowed_domains,
    blocked_domains,
    max_uses: 8, // fixed upper bound on searches per call
  }
  return schema
}
/**
 * Converts the content blocks of a server-side web search response into the
 * tool's `Output` shape.
 *
 * The blocks are expected to arrive as:
 *   - text to start
 *   - then, repeated once per search:
 *       server_tool_use, web_search_tool_result, text/citation blocks
 *
 * Consecutive text blocks are concatenated into one string entry, each
 * successful search becomes a `{ tool_use_id, content }` entry holding the
 * title/url of every hit, and a failed search becomes a
 * `"Web search error: <code>"` string (which is also logged).
 *
 * Text that is empty after trimming is never emitted, whether it precedes a
 * search or trails the last block.
 *
 * @param result - Content blocks of the assistant response, in order.
 * @param query - The query the tool was invoked with (echoed in the output).
 * @param durationSeconds - Wall-clock duration of the search call.
 * @returns The assembled tool output.
 */
function makeOutputFromSearchResponse(
  result: BetaContentBlock[],
  query: string,
  durationSeconds: number,
): Output {
  const results: (SearchResult | string)[] = []
  let textAcc = ''
  let inText = true

  for (const block of result) {
    if (block.type === 'server_tool_use') {
      // A search is starting: flush whatever text came before it.
      if (inText) {
        inText = false
        const pending = textAcc.trim()
        if (pending.length > 0) {
          results.push(pending)
        }
        textAcc = ''
      }
      continue
    }

    if (block.type === 'web_search_tool_result') {
      // Error case - content is a WebSearchToolResultError, not a hit list.
      if (!Array.isArray(block.content)) {
        const errorMessage = `Web search error: ${block.content.error_code}`
        logError(new Error(errorMessage))
        results.push(errorMessage)
        continue
      }
      // Success case - keep only the fields the output schema exposes.
      const hits = block.content.map(r => ({ title: r.title, url: r.url }))
      results.push({
        tool_use_id: block.tool_use_id,
        content: hits,
      })
      continue
    }

    if (block.type === 'text') {
      if (inText) {
        textAcc += block.text
      } else {
        // First text block after a search starts a fresh accumulator.
        inText = true
        textAcc = block.text
      }
    }
  }

  // Final flush. Check the *trimmed* text so that whitespace-only trailing
  // content does not add an empty-string entry to the results.
  const trailing = textAcc.trim()
  if (trailing.length > 0) {
    results.push(trailing)
  }

  return {
    query,
    results,
    durationSeconds,
  }
}
export const WebSearchTool = buildTool({
name: WEB_SEARCH_TOOL_NAME,
searchHint: 'search the web for current information',
@@ -166,30 +79,9 @@ export const WebSearchTool = buildTool({
return summary ? `Searching for ${summary}` : 'Searching the web'
},
isEnabled() {
const provider = getAPIProvider()
const model = getMainLoopModel()
// Enable for firstParty
if (provider === 'firstParty') {
return true
}
// Enable for Vertex AI with supported models (Claude 4.0+)
if (provider === 'vertex') {
const supportsWebSearch =
model.includes('claude-opus-4') ||
model.includes('claude-sonnet-4') ||
model.includes('claude-haiku-4')
return supportsWebSearch
}
// Foundry only ships models that already support Web Search
if (provider === 'foundry') {
return true
}
return false
// Always enabled — the adapter factory selects the appropriate backend
// (API server-side search or Bing fallback) based on provider capabilities.
return true
},
get inputSchema(): InputSchema {
return inputSchema()
@@ -227,9 +119,6 @@ export const WebSearchTool = buildTool({
renderToolUseProgressMessage,
renderToolResultMessage,
extractSearchText() {
// renderToolResultMessage shows only "Did N searches in Xs" chrome —
// the results[] content never appears on screen. Heuristic would index
// string entries in results[] (phantom match). Nothing to search.
return ''
},
async validateInput(input) {
@@ -254,149 +143,42 @@ export const WebSearchTool = buildTool({
async call(input, context, _canUseTool, _parentMessage, onProgress) {
const startTime = performance.now()
const { query } = input
const userMessage = createUserMessage({
content: 'Perform a web search for the query: ' + query,
})
const toolSchema = makeToolSchema(input)
const useHaiku = getFeatureValue_CACHED_MAY_BE_STALE(
'tengu_plum_vx3',
false,
)
const appState = context.getAppState()
const queryStream = queryModelWithStreaming({
messages: [userMessage],
systemPrompt: asSystemPrompt([
'You are an assistant for performing a web search tool use',
]),
thinkingConfig: useHaiku
? { type: 'disabled' as const }
: context.options.thinkingConfig,
tools: [],
const adapter = createAdapter()
const adapterResults = await adapter.search(query, {
allowedDomains: input.allowed_domains,
blockedDomains: input.blocked_domains,
signal: context.abortController.signal,
options: {
getToolPermissionContext: async () => appState.toolPermissionContext,
model: useHaiku ? getSmallFastModel() : context.options.mainLoopModel,
toolChoice: useHaiku ? { type: 'tool', name: 'web_search' } : undefined,
isNonInteractiveSession: context.options.isNonInteractiveSession,
hasAppendSystemPrompt: !!context.options.appendSystemPrompt,
extraToolSchemas: [toolSchema],
querySource: 'web_search_tool',
agents: context.options.agentDefinitions.activeAgents,
mcpTools: [],
agentId: context.agentId,
effortValue: appState.effortValue,
onProgress(progress) {
if (onProgress) {
const progressCounter = Date.now()
onProgress({
toolUseID: `search-progress-${progressCounter}`,
data: progress,
})
}
},
})
const allContentBlocks: BetaContentBlock[] = []
let currentToolUseId = null
let currentToolUseJson = ''
let progressCounter = 0
const toolUseQueries = new Map() // Map of tool_use_id to query
for await (const event of queryStream) {
if (event.type === 'assistant') {
const msg = event as { message: { content: BetaContentBlock[] } }
allContentBlocks.push(...msg.message.content)
continue
}
// Track tool use ID when server_tool_use starts
if (
event.type === 'stream_event'
) {
const streamEvt = event as { event?: { type: string; content_block?: { type: string; id?: string; tool_use_id?: string; content?: unknown; [key: string]: unknown }; delta?: { type: string; partial_json?: string; [key: string]: unknown }; [key: string]: unknown } }
if (streamEvt.event?.type === 'content_block_start') {
const contentBlock = streamEvt.event.content_block
if (contentBlock && contentBlock.type === 'server_tool_use') {
currentToolUseId = contentBlock.id as string
currentToolUseJson = ''
// Note: The ServerToolUseBlock doesn't contain input.query
// The actual query comes through input_json_delta events
continue
}
}
// Accumulate JSON for current tool use
if (
currentToolUseId &&
streamEvt.event?.type === 'content_block_delta'
) {
const delta = streamEvt.event.delta
if (delta?.type === 'input_json_delta' && delta.partial_json) {
currentToolUseJson += delta.partial_json
// Try to extract query from partial JSON for progress updates
try {
// Look for a complete query field
const queryMatch = currentToolUseJson.match(
/"query"\s*:\s*"((?:[^"\\]|\\.)*)"/,
)
if (queryMatch && queryMatch[1]) {
// The regex properly handles escaped characters
const query = jsonParse('"' + queryMatch[1] + '"')
if (
!toolUseQueries.has(currentToolUseId) ||
toolUseQueries.get(currentToolUseId) !== query
) {
toolUseQueries.set(currentToolUseId, query)
progressCounter++
if (onProgress) {
onProgress({
toolUseID: `search-progress-${progressCounter}`,
data: {
type: 'query_update',
query,
},
})
}
}
}
} catch {
// Ignore parsing errors for partial JSON
}
}
}
// Yield progress when search results come in
if (
streamEvt.event?.type === 'content_block_start'
) {
const contentBlock = streamEvt.event.content_block
if (contentBlock && contentBlock.type === 'web_search_tool_result') {
// Get the actual query that was used for this search
const toolUseId = contentBlock.tool_use_id
const actualQuery = toolUseQueries.get(toolUseId) || query
const content = contentBlock.content
progressCounter++
if (onProgress) {
onProgress({
toolUseID: toolUseId || `search-progress-${progressCounter}`,
data: {
type: 'search_results_received',
resultCount: Array.isArray(content) ? content.length : 0,
query: actualQuery,
},
})
}
}
}
} // end stream_event
}
// Process the final result
const endTime = performance.now()
const durationSeconds = (endTime - startTime) / 1000
const data = makeOutputFromSearchResponse(
allContentBlocks,
// Convert adapter SearchResult[] to legacy Output format
const results: (SearchResult | string)[] = []
if (adapterResults.length > 0) {
results.push({
tool_use_id: 'adapter-search-1',
content: adapterResults.map(r => ({ title: r.title, url: r.url, snippet: r.snippet })),
})
} else {
results.push('No search results found.')
}
const data: Output = {
query,
results,
durationSeconds,
)
}
return { data }
},
mapToolResultToToolResultBlockParam(output, toolUseID) {
@@ -404,20 +186,23 @@ export const WebSearchTool = buildTool({
let formattedOutput = `Web search results for query: "${query}"\n\n`
// Process the results array - it can contain both string summaries and search result objects.
// Guard against null/undefined entries that can appear after JSON round-tripping
// (e.g., from compaction or transcript deserialization).
;(results ?? []).forEach(result => {
if (result == null) {
return
}
if (typeof result === 'string') {
// Text summary
formattedOutput += result + '\n\n'
} else {
// Search result with links
if (result.content?.length > 0) {
formattedOutput += `Links: ${jsonStringify(result.content)}\n\n`
formattedOutput += 'Links:\n'
for (const link of result.content) {
formattedOutput += ` - [${link.title}](${link.url})`
if (link.snippet) {
formattedOutput += `: ${link.snippet}`
}
formattedOutput += '\n'
}
formattedOutput += '\n'
} else {
formattedOutput += 'No links found.\n\n'
}

View File

@@ -0,0 +1,82 @@
/**
* Integration test for BingSearchAdapter — hits the real Bing search.
*
* Usage:
* bun run src/tools/WebSearchTool/__tests__/bingAdapter.integration.ts
*
* Optional env vars:
* BING_QUERY — search query (default: "Claude AI Anthropic")
*/
// Provide MACRO globals needed by the codebase when running outside dev mode
if (!globalThis.MACRO) {
globalThis.MACRO = { VERSION: '0.0.0-test', BUILD_TIME: '0' } as any
}
import { BingSearchAdapter, extractBingResults } from '../adapters/bingAdapter'
const query = process.env.BING_QUERY || 'Claude AI Anthropic'
/**
 * Runs one live search through `BingSearchAdapter`, prints progress and the
 * returned hits, then checks that every hit has a non-empty string title and
 * an http(s) URL.
 *
 * Exits the process with status 1 when no results come back or when any hit
 * fails the structural check.
 */
async function main() {
  console.log(`\n🔍 Searching Bing for: "${query}"\n`)

  const bing = new BingSearchAdapter()
  const startedAt = Date.now()
  const results = await bing.search(query, {
    onProgress: (progress) => {
      switch (progress.type) {
        case 'query_update':
          console.log(` → Query sent: ${progress.query}`)
          break
        case 'search_results_received':
          console.log(` → Received ${progress.resultCount} results`)
          break
      }
    },
  })
  const elapsed = Date.now() - startedAt
  console.log(`\n✅ Done in ${elapsed}ms — ${results.length} result(s)\n`)

  if (results.length === 0) {
    const diagnostics = [
      '⚠️ No results returned. Possible causes:',
      ' - Bing returned a CAPTCHA or rate-limited the request',
      ' - Network/firewall issue',
      ' - Bing HTML structure changed',
      ' - Anti-bot detection triggered\n',
    ]
    for (const line of diagnostics) {
      console.log(line)
    }
    process.exit(1)
  }

  // Print each hit; snippets are flattened to one line and capped at 150 chars.
  results.forEach((hit, index) => {
    console.log(` ${index + 1}. ${hit.title}`)
    console.log(` ${hit.url}`)
    if (hit.snippet) {
      const oneLine = hit.snippet.replace(/\n/g, ' ')
      const ellipsis = oneLine.length > 150 ? '…' : ''
      console.log(` ${oneLine.slice(0, 150)}${ellipsis}`)
    }
    console.log()
  })

  // Validate result structure
  let invalidCount = 0
  results.forEach((hit, index) => {
    const position = index + 1
    if (!hit.title || typeof hit.title !== 'string') {
      console.error(`❌ Result ${position}: missing or non-string title`, hit)
      invalidCount += 1
    }
    if (!(hit.url && hit.url.startsWith('http'))) {
      console.error(`❌ Result ${position}: missing or non-http url`, hit)
      invalidCount += 1
    }
  })

  if (invalidCount > 0) {
    process.exit(1)
  } else {
    console.log('✅ All results have valid structure.\n')
  }
}
main().catch((e) => {
console.error('❌ Fatal error:', e)
process.exit(1)
})

View File

@@ -0,0 +1,499 @@
import { describe, expect, mock, test } from 'bun:test'
import { extractBingResults, decodeHtmlEntities } from '../adapters/bingAdapter'
// ---------------------------------------------------------------------------
// decodeHtmlEntities
// ---------------------------------------------------------------------------
describe('decodeHtmlEntities', () => {
  // Each row: [test name, encoded input, expected decoded output].
  const cases: ReadonlyArray<readonly [string, string, string]> = [
    ['decodes common named entities', '&amp; &lt; &gt;', '& < >'],
    ['decodes quote entities', '&quot;hello&quot;', '"hello"'],
    ['decodes numeric and hex apostrophe entities', '&#39;it&#x27;s', "'it's"],
    ['decodes &nbsp; to non-breaking space (\\u00A0)', 'a&nbsp;b', 'a\u00A0b'],
    ['returns plain text unchanged', 'hello world', 'hello world'],
    ['handles empty string', '', ''],
    ['decodes multiple occurrences of the same entity', 'a&amp;b&amp;c', 'a&b&c'],
    ['handles mixed entities in one string', '&lt;a&nbsp;href=&quot;x&quot;&gt;', '<a\u00A0href="x">'],
  ]

  // Register one test per row, in table order.
  for (const [name, encoded, decoded] of cases) {
    test(name, () => {
      expect(decodeHtmlEntities(encoded)).toBe(decoded)
    })
  }
})
// ---------------------------------------------------------------------------
// extractBingResults
// ---------------------------------------------------------------------------
// Unit tests for extractBingResults: each test feeds a hand-written HTML
// fixture to the parser and asserts on the returned { title, url, snippet }
// objects. No network access is involved.
describe('extractBingResults', () => {
// Happy path: two organic results, each with title, URL and snippet.
test('extracts results from standard Bing HTML', () => {
const html = `
<ol id="b_results">
<li class="b_algo">
<h2><a href="https://example.com/page1" h="ID=SERP,1">Example Title 1</a></h2>
<div class="b_caption">
<p class="b_lineclamp">First result description</p>
</div>
</li>
<li class="b_algo">
<h2><a href="https://example.com/page2" h="ID=SERP,2">Example Title 2</a></h2>
<div class="b_caption">
<p class="b_lineclamp">Second result description</p>
</div>
</li>
</ol>
`
const results = extractBingResults(html)
expect(results).toHaveLength(2)
expect(results[0]).toEqual({
title: 'Example Title 1',
url: 'https://example.com/page1',
snippet: 'First result description',
})
expect(results[1]).toEqual({
title: 'Example Title 2',
url: 'https://example.com/page2',
snippet: 'Second result description',
})
})
// Inputs without any `b_algo` list item must yield an empty array.
test('returns empty array when no b_algo blocks exist', () => {
const html = `
<ol id="b_results">
<li class="b_ad">Ad result</li>
<li class="b_ans">Answer card</li>
</ol>
`
expect(extractBingResults(html)).toEqual([])
})
test('returns empty array for empty HTML', () => {
expect(extractBingResults('')).toEqual([])
})
test('returns empty array for unrelated HTML', () => {
expect(extractBingResults('<html><body>Hello</body></html>')).toEqual([])
})
// Relative links, bing.com links and fragment-only links are not results.
test('skips Bing-internal links', () => {
const html = `
<li class="b_algo">
<h2><a href="/search?q=more">More results</a></h2>
</li>
<li class="b_algo">
<h2><a href="https://www.bing.com/videos">Bing Videos</a></h2>
</li>
<li class="b_algo">
<h2><a href="#anchor">Jump link</a></h2>
</li>
`
expect(extractBingResults(html)).toEqual([])
})
// Title clean-up: inline markup is removed and entities are decoded.
test('strips HTML tags from titles', () => {
const html = `
<li class="b_algo">
<h2><a href="https://example.com">Result with <strong>bold</strong> and <em>italic</em></a></h2>
</li>
`
const results = extractBingResults(html)
expect(results).toHaveLength(1)
expect(results[0].title).toBe('Result with bold and italic')
})
test('decodes HTML entities in titles', () => {
const html = `
<li class="b_algo">
<h2><a href="https://example.com">Tom &amp; Jerry &lt;cartoon&gt;</a></h2>
</li>
`
const results = extractBingResults(html)
expect(results[0].title).toBe('Tom & Jerry <cartoon>')
})
// Snippet extraction: the next four tests cover the three markup variants
// a snippet can come from, plus the case where there is none.
test('extracts snippet from b_lineclamp class', () => {
const html = `
<li class="b_algo">
<h2><a href="https://example.com">Title</a></h2>
<p class="b_lineclamp3 b_algo_slug">Lineclamp snippet text here</p>
</li>
`
const results = extractBingResults(html)
expect(results[0].snippet).toBe('Lineclamp snippet text here')
})
test('extracts snippet from b_caption paragraph fallback', () => {
const html = `
<li class="b_algo">
<h2><a href="https://example.com">Title</a></h2>
<div class="b_caption">
<p>Caption paragraph text</p>
</div>
</li>
`
const results = extractBingResults(html)
expect(results[0].snippet).toBe('Caption paragraph text')
})
test('extracts snippet from b_caption div fallback', () => {
const html = `
<li class="b_algo">
<h2><a href="https://example.com">Title</a></h2>
<div class="b_caption">Direct caption text without p tag</div>
</li>
`
const results = extractBingResults(html)
expect(results[0].snippet).toBe('Direct caption text without p tag')
})
test('returns undefined snippet when no caption exists', () => {
const html = `
<li class="b_algo">
<h2><a href="https://example.com">Title Only</a></h2>
</li>
`
const results = extractBingResults(html)
expect(results[0].snippet).toBeUndefined()
})
// Ads (`b_ad`) and answer cards (`b_ans`) sit next to organic results and
// must be ignored even when they contain an h2 > a link.
test('handles mixed result types and only extracts b_algo', () => {
const html = `
<ol id="b_results">
<li class="b_ad"><h2><a href="https://ad.com">Ad Title</a></h2></li>
<li class="b_algo">
<h2><a href="https://real-result.com">Real Result</a></h2>
<p class="b_lineclamp">A real snippet</p>
</li>
<li class="b_ans"><div>People also ask</div></li>
<li class="b_algo">
<h2><a href="https://another.com">Another Result</a></h2>
</li>
</ol>
`
const results = extractBingResults(html)
expect(results).toHaveLength(2)
expect(results[0].title).toBe('Real Result')
expect(results[1].title).toBe('Another Result')
})
test('skips b_algo blocks without h2 > a structure', () => {
const html = `
<li class="b_algo">
<div>No link here</div>
</li>
<li class="b_algo">
<h2><a href="https://example.com">Valid Result</a></h2>
</li>
`
const results = extractBingResults(html)
expect(results).toHaveLength(1)
expect(results[0].title).toBe('Valid Result')
})
// Whitespace and newlines around the anchor and its text must not end up
// in the extracted title.
test('handles extra whitespace in h2 > a structure', () => {
const html = `
<li class="b_algo">
<h2>
<a href="https://example.com" h="ID=SERP,1" >
Whitespace Title
</a>
</h2>
</li>
`
const results = extractBingResults(html)
expect(results).toHaveLength(1)
expect(results[0].title).toBe('Whitespace Title')
})
test('handles snippet with HTML entities', () => {
const html = `
<li class="b_algo">
<h2><a href="https://example.com">Title</a></h2>
<p class="b_lineclamp">5 &lt; 10 &amp; 10 &gt; 5</p>
</li>
`
const results = extractBingResults(html)
expect(results[0].snippet).toBe('5 < 10 & 10 > 5')
})
// Larger fixture with extra wrappers (b_title, b_attribution/cite) around
// the title and snippet; the cite text must not be picked up as the snippet.
test('handles real-world Bing HTML structure', () => {
const html = `
<ol id="b_results" role="main">
<li class="b_algo" data-id="">
<div class="b_title">
<h2>
<a href="https://docs.python.org/3/tutorial/index.html" target="_blank" h="ID=SERP,5125.1">
Python Tutorial
</a>
</h2>
</div>
<div class="b_caption">
<div class="b_attribution" u="0|5125|4976674477245">
<cite>https://docs.python.org</cite>
</div>
<p class="b_lineclamp3">
Welcome to the Python Tutorial. This tutorial introduces you to the basic concepts and features...
</p>
</div>
</li>
<li class="b_algo">
<h2>
<a href="https://realpython.com/python-guide/" h="ID=SERP,5125.2">
Real Python Guide
</a>
</h2>
<div class="b_caption">
<div class="b_attribution">
<cite>https://realpython.com</cite>
</div>
<p>
The ultimate Python guide for beginners and experts alike.
</p>
</div>
</li>
</ol>
`
const results = extractBingResults(html)
expect(results).toHaveLength(2)
expect(results[0].title).toBe('Python Tutorial')
expect(results[0].url).toBe('https://docs.python.org/3/tutorial/index.html')
expect(results[0].snippet).toContain('Welcome to the Python Tutorial')
expect(results[1].title).toBe('Real Python Guide')
expect(results[1].snippet).toContain('ultimate Python guide')
})
})
// ---------------------------------------------------------------------------
// BingSearchAdapter.search (integration with mocked axios)
// ---------------------------------------------------------------------------
describe('BingSearchAdapter.search', () => {
  // Dynamic import so mock.module() takes effect
  const createAdapter = async () => {
    const { BingSearchAdapter } = await import('../adapters/bingAdapter')
    return new BingSearchAdapter()
  }

  /** Shape of the `axios.get` stand-in used by these tests. */
  type AxiosGetStub = (
    url: string,
    config?: { signal?: AbortSignal },
  ) => Promise<unknown>

  /**
   * Installs the module mocks shared by every test in this suite.
   *
   * @param get - Replacement for `axios.get`.
   * @param isCancel - Replacement for `axios.isCancel`; defaults to "never a cancel".
   */
  const installMocks = (
    get: AxiosGetStub,
    isCancel: (e: unknown) => boolean = () => false,
  ) => {
    mock.module('axios', () => ({
      default: { get, isCancel },
    }))
    // NOTE(review): kept from the original per-test setup to preserve
    // behaviour — confirm the adapter still depends on utils/http.
    mock.module('../../../utils/http', () => ({
      getWebFetchUserAgent: () => 'TestAgent/1.0',
    }))
  }

  /** Creates an `axios.get` stub that resolves with `html` as the response body. */
  const respondWith = (html: string) =>
    mock((_url: string, _config?: { signal?: AbortSignal }) =>
      Promise.resolve({ data: html }),
    )

  const SAMPLE_HTML = `
<ol id="b_results">
<li class="b_algo">
<h2><a href="https://example.com/result1">Result One</a></h2>
<p class="b_lineclamp">Snippet one</p>
</li>
<li class="b_algo">
<h2><a href="https://example.com/result2">Result Two</a></h2>
<p class="b_lineclamp">Snippet two</p>
</li>
</ol>
`

  test('returns parsed results from fetched HTML', async () => {
    installMocks(respondWith(SAMPLE_HTML))
    const adapter = await createAdapter()
    const results = await adapter.search('test query', {})
    expect(results).toHaveLength(2)
    expect(results[0].title).toBe('Result One')
    expect(results[1].title).toBe('Result Two')
  })

  test('calls onProgress with query_update and search_results_received', async () => {
    installMocks(respondWith(SAMPLE_HTML))
    const progressCalls: Array<{ type: string; query?: string; resultCount?: number }> = []
    const onProgress = (p: { type: string; query?: string; resultCount?: number }) =>
      progressCalls.push(p)
    const adapter = await createAdapter()
    await adapter.search('test', { onProgress })
    expect(progressCalls).toHaveLength(2)
    expect(progressCalls[0].type).toBe('query_update')
    expect(progressCalls[0].query).toBe('test')
    expect(progressCalls[1].type).toBe('search_results_received')
    expect(progressCalls[1].resultCount).toBe(2)
  })

  test('filters results by allowedDomains', async () => {
    const mixedHtml = `
<ol id="b_results">
<li class="b_algo">
<h2><a href="https://allowed.com/a">Allowed Result</a></h2>
</li>
<li class="b_algo">
<h2><a href="https://blocked.com/b">Blocked Result</a></h2>
</li>
</ol>
`
    installMocks(respondWith(mixedHtml))
    const adapter = await createAdapter()
    const results = await adapter.search('test', {
      allowedDomains: ['allowed.com'],
    })
    expect(results).toHaveLength(1)
    expect(results[0].url).toBe('https://allowed.com/a')
  })

  test('filters results by blockedDomains', async () => {
    const mixedHtml = `
<ol id="b_results">
<li class="b_algo">
<h2><a href="https://good.com/a">Good Result</a></h2>
</li>
<li class="b_algo">
<h2><a href="https://spam.com/b">Spam Result</a></h2>
</li>
</ol>
`
    installMocks(respondWith(mixedHtml))
    const adapter = await createAdapter()
    const results = await adapter.search('test', {
      blockedDomains: ['spam.com'],
    })
    expect(results).toHaveLength(1)
    expect(results[0].url).toBe('https://good.com/a')
  })

  test('filters subdomains with allowedDomains', async () => {
    const html = `
<ol id="b_results">
<li class="b_algo">
<h2><a href="https://docs.example.com/page">Subdomain Result</a></h2>
</li>
<li class="b_algo">
<h2><a href="https://other.com/page">Other Result</a></h2>
</li>
</ol>
`
    installMocks(respondWith(html))
    const adapter = await createAdapter()
    const results = await adapter.search('test', {
      allowedDomains: ['example.com'],
    })
    expect(results).toHaveLength(1)
    expect(results[0].url).toBe('https://docs.example.com/page')
  })

  test('throws AbortError when signal is already aborted', async () => {
    // Stub that rejects with a cancel-marked error when handed an aborted signal.
    const get = mock((_url: string, config?: { signal?: AbortSignal }) => {
      if (config?.signal?.aborted) {
        return Promise.reject(
          Object.assign(new Error('canceled'), { __CANCEL__: true }),
        )
      }
      return Promise.resolve({ data: SAMPLE_HTML })
    })
    const isCancel = (e: unknown) =>
      (e as { __CANCEL__?: boolean } | null | undefined)?.__CANCEL__ === true
    installMocks(get, isCancel)
    const adapter = await createAdapter()
    const controller = new AbortController()
    controller.abort()
    const { AbortError } = await import('../../../utils/errors')
    await expect(
      adapter.search('test', { signal: controller.signal }),
    ).rejects.toThrow(AbortError)
  })

  test('re-throws non-abort axios errors', async () => {
    const networkError = new Error('Network error')
    installMocks(mock(() => Promise.reject(networkError)))
    const adapter = await createAdapter()
    await expect(adapter.search('test', {})).rejects.toThrow('Network error')
  })

  test('encodes query parameter in URL', async () => {
    const axiosGet = respondWith(SAMPLE_HTML)
    installMocks(axiosGet)
    const adapter = await createAdapter()
    await adapter.search('hello world & special=chars', {})
    // First positional argument of the first axios.get call is the request URL.
    const calledUrl = axiosGet.mock.calls[0]?.[0]
    expect(calledUrl).toContain('q=hello%20world%20%26%20special%3Dchars')
  })
})

View File

@@ -0,0 +1,173 @@
/**
* API-based search adapter — delegates to Anthropic's server-side
* web_search_20250305 tool via a secondary API call.
*/
import type {
BetaContentBlock,
BetaWebSearchTool20250305,
} from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../../services/analytics/growthbook.js'
import { queryModelWithStreaming } from '../../../services/api/claude.js'
import { createUserMessage } from '../../../utils/messages.js'
import { getMainLoopModel, getSmallFastModel } from '../../../utils/model/model.js'
import { jsonParse } from '../../../utils/slowOperations.js'
import { asSystemPrompt } from '../../../utils/systemPromptType.js'
import type { SearchResult, SearchOptions, WebSearchAdapter } from './types.js'
/**
 * Builds the `web_search_20250305` server tool definition sent with the
 * secondary API call.
 *
 * @param input - Optional allow/block domain lists, forwarded unchanged.
 * @returns Tool schema named `web_search`, limited to 8 uses.
 */
function makeToolSchema(input: { allowedDomains?: string[]; blockedDomains?: string[] }): BetaWebSearchTool20250305 {
  const { allowedDomains, blockedDomains } = input
  const schema: BetaWebSearchTool20250305 = {
    type: 'web_search_20250305',
    name: 'web_search',
    allowed_domains: allowedDomains,
    blocked_domains: blockedDomains,
    max_uses: 8,
  }
  return schema
}
export class ApiSearchAdapter implements WebSearchAdapter {
  /**
   * Runs a web search by issuing a secondary model request that carries the
   * server-side `web_search` tool, then collects the hits from the response.
   *
   * While the response streams, `options.onProgress` is called with:
   *   - `query_update` each time a (new) query string can be read from the
   *     partially streamed tool input, and
   *   - `search_results_received` when a result block for a search starts.
   *
   * When the `tengu_plum_vx3` feature value is true the request uses the
   * small fast model with thinking disabled and forces the `web_search` tool;
   * otherwise it uses the main loop model with thinking enabled.
   *
   * @param query - The search query to perform.
   * @param options - Domain filters, abort signal and progress callback.
   * @returns Title/URL pairs of all hits found in the response.
   */
  async search(
    query: string,
    options: SearchOptions,
  ): Promise<SearchResult[]> {
    const { signal, onProgress, allowedDomains, blockedDomains } = options

    const userMessage = createUserMessage({
      content: 'Perform a web search for the query: ' + query,
    })
    const toolSchema = makeToolSchema({ allowedDomains, blockedDomains })
    const useHaiku = getFeatureValue_CACHED_MAY_BE_STALE('tengu_plum_vx3', false)

    const queryStream = queryModelWithStreaming({
      messages: [userMessage],
      systemPrompt: asSystemPrompt([
        'You are an assistant for performing a web search tool use',
      ]),
      thinkingConfig: useHaiku
        ? { type: 'disabled' as const }
        : { type: 'enabled' as const, budgetTokens: 10000 },
      tools: [],
      // Fall back to a signal that never aborts when the caller gave none.
      signal: signal ?? new AbortController().signal,
      options: {
        getToolPermissionContext: async () => ({
          mode: 'default' as const,
          additionalWorkingDirectories: new Map(),
          alwaysAllowRules: {},
          alwaysDenyRules: {},
          alwaysAskRules: {},
          isBypassPermissionsModeAvailable: false,
        }),
        model: useHaiku ? getSmallFastModel() : getMainLoopModel(),
        toolChoice: useHaiku ? { type: 'tool' as const, name: 'web_search' } : undefined,
        isNonInteractiveSession: false,
        hasAppendSystemPrompt: false,
        extraToolSchemas: [toolSchema],
        querySource: 'web_search_tool' as const,
        agents: [],
        mcpTools: [],
        agentId: undefined,
        effortValue: undefined,
      },
    })

    const allContentBlocks: BetaContentBlock[] = []
    let currentToolUseId: string | null = null
    let currentToolUseJson = ''
    // tool_use_id -> last query string reported for that search
    const toolUseQueries = new Map<string, string>()
    // Matches a complete `"query": "..."` field (escapes allowed) inside the
    // partially streamed tool input JSON.
    const queryFieldPattern = /"query"\s*:\s*"((?:[^"\\]|\\.)*)"/

    for await (const event of queryStream) {
      if (event.type === 'assistant') {
        const msg = event as { message: { content: BetaContentBlock[] } }
        allContentBlocks.push(...msg.message.content)
        continue
      }
      if (event.type !== 'stream_event') {
        continue
      }

      const streamEvt = event as {
        event?: {
          type: string
          content_block?: { type: string; id?: string; tool_use_id?: string; content?: unknown; [key: string]: unknown }
          delta?: { type: string; partial_json?: string; [key: string]: unknown }
          [key: string]: unknown
        }
      }
      const inner = streamEvt.event

      if (inner?.type === 'content_block_start') {
        const contentBlock = inner.content_block
        if (contentBlock?.type === 'server_tool_use') {
          // A new search is starting; its query arrives via input_json_delta.
          currentToolUseId = contentBlock.id as string
          currentToolUseJson = ''
        } else if (contentBlock?.type === 'web_search_tool_result') {
          const toolUseId = contentBlock.tool_use_id as string
          const actualQuery = toolUseQueries.get(toolUseId) || query
          const content = contentBlock.content
          onProgress?.({
            type: 'search_results_received',
            // Non-array content is an error payload, reported as zero results.
            resultCount: Array.isArray(content) ? content.length : 0,
            query: actualQuery,
          })
        }
        continue
      }

      if (inner?.type === 'content_block_delta' && currentToolUseId) {
        const delta = inner.delta
        if (delta?.type === 'input_json_delta' && delta.partial_json) {
          currentToolUseJson += delta.partial_json
          const queryMatch = currentToolUseJson.match(queryFieldPattern)
          if (queryMatch && queryMatch[1]) {
            let parsedQuery: unknown
            try {
              // Re-quote the capture so JSON escapes are decoded properly.
              parsedQuery = jsonParse('"' + queryMatch[1] + '"')
            } catch {
              // Ignore parsing errors for partial JSON. Only the parse is
              // guarded, so errors thrown by onProgress are not swallowed.
              parsedQuery = undefined
            }
            if (
              typeof parsedQuery === 'string' &&
              toolUseQueries.get(currentToolUseId) !== parsedQuery
            ) {
              toolUseQueries.set(currentToolUseId, parsedQuery)
              onProgress?.({
                type: 'query_update',
                query: parsedQuery,
              })
            }
          }
        }
      }
    }

    // Extract SearchResult[] from content blocks
    return extractSearchResults(allContentBlocks)
  }
}
/**
 * Collects the hits of every successful `web_search_tool_result` block.
 *
 * Blocks of other types, and result blocks whose content is not an array,
 * contribute nothing. Only `title` and `url` are kept from each hit.
 *
 * @param blocks - Content blocks of the assistant response, in order.
 * @returns Flattened list of hits in the order they appear.
 */
function extractSearchResults(
  blocks: BetaContentBlock[],
): SearchResult[] {
  return blocks.flatMap(block => {
    if (block.type !== 'web_search_tool_result' || !Array.isArray(block.content)) {
      return []
    }
    const hits = block.content as Array<{ title: string; url: string; page_age?: string; type?: string }>
    return hits.map(({ title, url }) => ({ title, url }))
  })
}

View File

@@ -0,0 +1,204 @@
/**
* Bing-based search adapter — fetches Bing search pages and extracts
* search results using regex pattern matching on raw HTML.
*/
import axios from 'axios'
import he from 'he'
import { AbortError } from '../../../utils/errors.js'
import type { SearchResult, SearchOptions, WebSearchAdapter } from './types.js'
/** Timeout, in milliseconds, passed to axios for the Bing page request (30 s). */
const FETCH_TIMEOUT_MS = 30_000
/**
 * Browser-like headers to avoid Bing's anti-bot JS-rendered response.
 * These mimic Microsoft Edge on macOS to get full HTML search results.
 * The User-Agent and the Sec-Ch-Ua client hints both advertise Edge 131, and
 * the Sec-Fetch-* values describe a user-initiated top-level navigation.
 */
const BROWSER_HEADERS = {
'User-Agent':
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
Accept:
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
'Accept-Language': 'en-US,en;q=0.9',
'Accept-Encoding': 'gzip, deflate, br',
'Cache-Control': 'no-cache',
Pragma: 'no-cache',
'Sec-Ch-Ua': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': '"macOS"',
'Sec-Fetch-Dest': 'document',
'Sec-Fetch-Mode': 'navigate',
'Sec-Fetch-Site': 'none',
'Sec-Fetch-User': '?1',
'Upgrade-Insecure-Requests': '1',
} as const
/**
 * Search backend that fetches a Bing results page over HTTP and scrapes the
 * organic results out of the returned HTML.
 */
export class BingSearchAdapter implements WebSearchAdapter {
  /**
   * Runs one Bing search and returns the (optionally domain-filtered) results.
   *
   * Progress is reported twice through `options.onProgress`: a `query_update`
   * before the request and a `search_results_received` (with the post-filter
   * count) after parsing.
   *
   * @param query   Search text; URL-encoded into the `q` parameter.
   * @param options Optional abort signal, progress callback and domain filters.
   * @returns Results whose URL parses and passes the allow/block lists.
   * @throws AbortError if `options.signal` is already aborted, or aborts while
   *         the request is in flight.
   * @throws Any other axios error (network failure, timeout, non-2xx status)
   *         is rethrown unchanged.
   */
  async search(
    query: string,
    options: SearchOptions,
  ): Promise<SearchResult[]> {
    const { signal, onProgress, allowedDomains, blockedDomains } = options

    if (signal?.aborted) {
      throw new AbortError()
    }

    onProgress?.({ type: 'query_update', query })

    const url = `https://www.bing.com/search?q=${encodeURIComponent(query)}&setmkt=en-US`

    // Bridge the caller's signal to a local controller so the request can be
    // cancelled. The listener is kept in a variable so it can be detached once
    // the request settles; `{ once: true }` alone only removes it if the
    // signal actually fires, which would leak one listener per search on a
    // long-lived signal.
    const abortController = new AbortController()
    const forwardAbort = (): void => abortController.abort()
    signal?.addEventListener('abort', forwardAbort, { once: true })

    let html: string
    try {
      const response = await axios.get(url, {
        signal: abortController.signal,
        timeout: FETCH_TIMEOUT_MS,
        responseType: 'text',
        headers: BROWSER_HEADERS,
      })
      html = response.data
    } catch (e) {
      // Normalize every cancellation flavour to the project's AbortError.
      if (axios.isCancel(e) || abortController.signal.aborted) {
        throw new AbortError()
      }
      throw e
    } finally {
      signal?.removeEventListener('abort', forwardAbort)
    }

    if (abortController.signal.aborted) {
      throw new AbortError()
    }

    const rawResults = extractBingResults(html)

    // Client-side domain filtering
    const results = rawResults.filter(r => {
      if (!r.url) return false
      let hostname: string
      try {
        hostname = new URL(r.url).hostname
      } catch {
        // Unparseable URL: cannot be checked against the lists, so drop it.
        return false
      }
      if (allowedDomains?.length && !hostMatchesAnyDomain(hostname, allowedDomains)) {
        return false
      }
      if (blockedDomains?.length && hostMatchesAnyDomain(hostname, blockedDomains)) {
        return false
      }
      return true
    })

    onProgress?.({
      type: 'search_results_received',
      resultCount: results.length,
      query,
    })

    return results
  }
}

/**
 * True when `hostname` equals one of `domains` or is a subdomain of one.
 * The comparison is case-insensitive: `URL#hostname` is always lower-case,
 * so a filter entry such as "GitHub.com" would otherwise never match.
 */
function hostMatchesAnyDomain(
  hostname: string,
  domains: readonly string[],
): boolean {
  const host = hostname.toLowerCase()
  return domains.some(entry => {
    const domain = entry.toLowerCase()
    return host === domain || host.endsWith('.' + domain)
  })
}
/**
 * Pull the organic results out of a Bing results page.
 *
 * Each `<li class="b_algo">…</li>` fragment is scanned for the first
 * `<h2><a href="…">…</a>` link. The href is entity-decoded and passed through
 * `resolveBingUrl`; fragments without such a link, or whose link does not
 * resolve to a URL, are skipped. The title is the anchor's inner HTML with
 * tags stripped, trimmed and entity-decoded; the snippet comes from
 * `extractSnippet` and may be undefined.
 *
 * @param html Raw HTML of the results page.
 * @returns One entry per accepted fragment, in document order.
 */
export function extractBingResults(html: string): SearchResult[] {
  const organicBlockPattern = /<li\s+class="b_algo"[^>]*>([\s\S]*?)<\/li>/gi
  // Primary link: <h2><a href="...">...</a></h2>
  const headingLinkPattern = /<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i

  const collected: SearchResult[] = []

  for (const [, fragment] of html.matchAll(organicBlockPattern)) {
    const heading = headingLinkPattern.exec(fragment)
    if (heading === null) continue

    const [, href, anchorHtml] = heading

    // Unwrap Bing redirect links (bing.com/ck/a?...&u=a1aHR0cHM6Ly9...);
    // Bing-internal and relative links come back undefined and are dropped.
    const target = resolveBingUrl(decodeHtmlEntities(href))
    if (!target) continue

    const plainTitle = anchorHtml.replace(/<[^>]+>/g, '').trim()

    collected.push({
      title: decodeHtmlEntities(plainTitle),
      url: target,
      // b_lineclamp → b_caption <p> → b_caption fallback
      snippet: extractSnippet(fragment),
    })
  }

  return collected
}
/**
 * Snippet locations inside one result block, in priority order:
 *   1. `<p class="b_lineclamp…">`
 *   2. the first `<p>` following a `<div class="b_caption…">` opening tag
 *   3. the whole content of a `<div class="b_caption…">`
 * None of the patterns is global, so they carry no `lastIndex` state and can
 * be shared across calls.
 */
const SNIPPET_PATTERNS: readonly RegExp[] = [
  /<p[^>]*class="b_lineclamp[^"]*"[^>]*>([\s\S]*?)<\/p>/i,
  /<div[^>]*class="b_caption[^"]*"[^>]*>[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/i,
  /<div[^>]*class="b_caption[^"]*"[^>]*>([\s\S]*?)<\/div>/i,
]

/**
 * Extracts the descriptive snippet of a single Bing result block.
 *
 * Each pattern in `SNIPPET_PATTERNS` is tried in order. The captured HTML has
 * its tags stripped and is trimmed; the first candidate that still contains
 * text is entity-decoded and returned. A candidate that matches but is empty
 * no longer ends the search with `''` — the next strategy is tried instead,
 * which is the same guard the last fallback already applied.
 *
 * @param block Inner HTML of one result block.
 * @returns The snippet text, or `undefined` when no strategy yields text.
 */
function extractSnippet(block: string): string | undefined {
  for (const pattern of SNIPPET_PATTERNS) {
    const match = pattern.exec(block)
    if (!match) continue

    const text = match[1].replace(/<[^>]+>/g, '').trim()
    if (text) return decodeHtmlEntities(text)
  }
  return undefined
}
/**
 * Alias for `he.decode` from the `he` package. Used in this module to decode
 * HTML character references in scraped hrefs, titles and snippets.
 */
export const decodeHtmlEntities = he.decode
/**
 * Resolve a link taken from a Bing results page to the actual target URL.
 *
 * Bing wraps outbound links in redirect URLs such as
 * `https://www.bing.com/ck/a?...&u=a1aHR0cHM6Ly9leGFtcGxlLmNvbQ...`, where the
 * `u` query parameter is a two-character prefix (`a1` for https, `a0` for
 * http) followed by the base64url-encoded target.
 *
 * Rules, in order:
 *   - relative links, anchors and strings that are not absolute URLs → `undefined`
 *   - non-http(s) schemes (e.g. `javascript:`) → `undefined`
 *   - a host other than bing.com / *.bing.com → `rawUrl` unchanged. The host is
 *     parsed rather than substring-matched, so `notbing.com` or a path that
 *     merely mentions "bing.com" is not mistaken for a Bing page, and an
 *     external URL's own `u=` parameter is never decoded.
 *   - a Bing host with a `u` parameter that decodes to an http(s) URL → that URL
 *   - any other Bing-hosted link → `undefined`
 *
 * @param rawUrl Href value, already HTML-entity-decoded.
 * @returns The external target URL, or `undefined` if the link should be skipped.
 */
export function resolveBingUrl(rawUrl: string): string | undefined {
  // Skip relative / anchor links
  if (rawUrl.startsWith('/') || rawUrl.startsWith('#')) return undefined

  let parsed: URL
  try {
    parsed = new URL(rawUrl)
  } catch {
    // Not an absolute URL (e.g. "search?q=x"): nothing usable to return.
    return undefined
  }
  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') {
    return undefined
  }

  // URL#hostname is already lower-cased by the parser.
  const host = parsed.hostname
  const isBingHost = host === 'bing.com' || host.endsWith('.bing.com')

  // Direct external URL (not a Bing-internal page)
  if (!isBingHost) return rawUrl

  // Bing redirect: pull the encoded target out of the `u` parameter. The raw
  // string is matched (instead of using searchParams) so a literal '+' in
  // standard base64 is not turned into a space.
  const uMatch = rawUrl.match(/[?&]u=([a-zA-Z0-9+/_=-]+)/)
  if (uMatch && uMatch[1].length >= 3) {
    // Drop the 2-char scheme prefix, then map base64url to standard base64.
    const b64 = uMatch[1].slice(2).replace(/-/g, '+').replace(/_/g, '/')
    // Buffer's base64 decoder is lenient (no padding needed, never throws),
    // so the decoded text is validated instead of relying on an exception.
    const decoded = Buffer.from(b64, 'base64').toString('utf-8')
    if (/^https?:\/\//i.test(decoded)) return decoded
  }

  // Bing-internal page with no usable redirect target.
  return undefined
}

View File

@@ -0,0 +1,41 @@
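/**
 * Search adapter factory. The intended design selects the backend by checking
 * whether the API base URL points to Anthropic's official endpoint, but that
 * selection logic is currently commented out and createAdapter() always
 * returns the Bing adapter.
 */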
import { isFirstPartyAnthropicBaseUrl } from '../../../utils/model/providers.js'
import { ApiSearchAdapter } from './apiAdapter.js'
import { BingSearchAdapter } from './bingAdapter.js'
import type { WebSearchAdapter } from './types.js'
export type { SearchResult, SearchOptions, SearchProgress, WebSearchAdapter } from './types.js'
// Memoization slot for the selection logic that is commented out below.
// Nothing in the active code reads or writes it.
let cachedAdapter: WebSearchAdapter | null = null
/**
 * Returns the web search backend to use.
 *
 * As written, every call constructs and returns a new BingSearchAdapter; the
 * env-override / base-URL selection and the caching are disabled (kept below
 * as commented-out code).
 */
export function createAdapter(): WebSearchAdapter {
// Use the Bing adapter directly, skipping the API-adapter selection logic.
return new BingSearchAdapter()
// NOTE(review): everything below is the disabled selection logic, kept for
// reference. Re-enabling it would also put `cachedAdapter` back into use.
// // Adapter is stateless — safe to reuse across calls within a session
// if (cachedAdapter) return cachedAdapter
// // Env override: WEB_SEARCH_ADAPTER=api|bing forces specific backend
// const envAdapter = process.env.WEB_SEARCH_ADAPTER
// if (envAdapter === 'api') {
// cachedAdapter = new ApiSearchAdapter()
// return cachedAdapter
// }
// if (envAdapter === 'bing') {
// cachedAdapter = new BingSearchAdapter()
// return cachedAdapter
// }
// // Anthropic official URL → API server-side search
// if (isFirstPartyAnthropicBaseUrl()) {
// cachedAdapter = new ApiSearchAdapter()
// return cachedAdapter
// }
// // Third-party proxies / non-Anthropic endpoints → Bing fallback
// cachedAdapter = new BingSearchAdapter()
// return cachedAdapter
}

View File

@@ -0,0 +1,22 @@
/** One search hit as returned by a WebSearchAdapter. */
export interface SearchResult {
// Title of the hit.
title: string
// URL of the hit.
url: string
// Optional descriptive text; adapters may leave this unset.
snippet?: string
}
/** Per-call options accepted by WebSearchAdapter.search. All fields are optional. */
export interface SearchOptions {
// When non-empty, restricts results to these domains.
allowedDomains?: string[]
// When non-empty, excludes results from these domains.
blockedDomains?: string[]
// Lets the caller cancel an in-flight search.
signal?: AbortSignal
// Callback invoked with SearchProgress events while the search runs.
onProgress?: (progress: SearchProgress) => void
}
/** Progress event delivered to SearchOptions.onProgress. */
export interface SearchProgress {
// Event kind: 'query_update' announces the query being searched;
// 'search_results_received' announces that results have arrived.
type: 'query_update' | 'search_results_received'
// The query the event refers to.
query?: string
// Number of results; sent with 'search_results_received' events.
resultCount?: number
}
/** Contract implemented by every search backend. */
export interface WebSearchAdapter {
// Runs a search for `query` and resolves with the results found.
search(query: string, options: SearchOptions): Promise<SearchResult[]>
}