` — 普通摘要段落
+3. `<div class="b_caption">` 的直接文本内容 — 兜底方案
+
+| 特性 | 实现 |
+|------|------|
+| **超时** | 30 秒(`FETCH_TIMEOUT_MS`) |
+| **域过滤** | 支持 `allowedDomains` / `blockedDomains`,含子域名匹配 |
+| **进度追踪** | 发送 query_update 和 search_results_received 回调 |
+| **中止支持** | 外部 AbortSignal 传播到 axios 请求 |
+
+### WebSearchTool 统一接口
+
+`WebSearchTool`(`src/tools/WebSearchTool/WebSearchTool.ts`)是面向主循环的工具定义,所有 provider 均可使用(`isEnabled()` 始终返回 true)。它将适配器返回的 `SearchResult[]` 转换为内部 `Output` 格式,`mapToolResultToToolResultBlockParam` 将搜索结果格式化为带 markdown 超链接的文本,并附加 "REMINDER" 要求主模型在回复中包含 Sources。
+
+### WebFetch 实现机制
+
+WebFetch 是一个完整的 HTTP 客户端 + 内容处理管线:
+
+```
+调用链:
+ WebFetchTool.call({ url, prompt })
+ → getURLMarkdownContent(url)
+ → validateURL() — 长度≤2000、无用户名密码、公网域名
+ → URL_CACHE 命中检查(15 分钟 TTL LRU,50MB 上限)
+ → checkDomainBlocklist() — 调用 api.anthropic.com/api/web/domain_info 预检
+ → getWithPermittedRedirects() — axios 请求,自定义重定向处理
+ → HTML → Turndown 转 Markdown(懒加载单例,~1.4MB)
+ → 非 HTML → 原始文本
+ → 二进制(PDF 等)→ persistBinaryContent() 保存到磁盘
+ → applyPromptToMarkdown()
+ → 截断到 100K 字符
+ → queryHaiku() 用小模型按 prompt 提取信息
+ → 返回处理后的结果
+```
+
+安全防护多层设计:
+
+| 层级 | 机制 | 说明 |
+|------|------|------|
+| **域名预检** | `checkDomainBlocklist()` | 调用 `api.anthropic.com/api/web/domain_info?domain=…`,5 分钟缓存;仅当设置项 `skipWebFetchPreflight` 显式为 `false` 时执行(未设置时跳过) |
+| **重定向控制** | `isPermittedRedirect()` | 仅允许同 host(±www)重定向,跨域重定向返回提示让 AI 重新调用 |
+| **重定向深度** | `MAX_REDIRECTS = 10` | 防止重定向循环无限挂起 |
+| **内容大小** | `MAX_HTTP_CONTENT_LENGTH = 10MB` | 单次响应上限 |
+| **请求超时** | `FETCH_TIMEOUT_MS = 60s` | 主请求超时;域名预检 10s |
+| **URL 验证** | `validateURL()` | 长度、协议、用户名密码、公网域名检查 |
+| **egress 检测** | `X-Proxy-Error: blocked-by-allowlist` | 检测企业代理拦截 |
+
+预批准域名(`src/tools/WebFetchTool/preapproved.ts`):
+
+用户无需手动授权即可抓取的域名列表,包含 ~90 个主流技术文档站点(MDN、Python docs、React docs、AWS docs 等)。列表分为 hostname-only 和 path-prefix 两类,查找复杂度 O(1)。
+
+对预批准域名,WebFetch 跳过 Haiku 摘要步骤(如果内容是 Markdown 且 < 100K 字符),直接返回原文——因为技术文档本身的结构化程度已经足够好。
+
+权限模型方面,WebFetch 按 hostname 生成 `domain:xxx` 规则匹配用户的 allow/deny/ask 规则,支持用户对特定域名配置永久允许或拒绝。
+
### ripgrep 的流式输出
对于交互式场景(如 QuickOpen),ripgrep 支持**流式输出**(`ripGrepStream()`):
diff --git a/package.json b/package.json
index 6bb3868ff..083a4b710 100644
--- a/package.json
+++ b/package.json
@@ -67,6 +67,7 @@
"@aws-sdk/credential-provider-node": "^3.972.28",
"@aws-sdk/credential-providers": "^3.1020.0",
"@azure/identity": "^4.13.1",
+ "@biomejs/biome": "^2.4.10",
"@commander-js/extra-typings": "^14.0.0",
"@growthbook/growthbook": "^1.6.5",
"@modelcontextprotocol/sdk": "^1.29.0",
@@ -90,6 +91,13 @@
"@opentelemetry/semantic-conventions": "^1.40.0",
"@smithy/core": "^3.23.13",
"@smithy/node-http-handler": "^4.5.1",
+ "@types/bun": "^1.3.11",
+ "@types/cacache": "^20.0.1",
+ "@types/plist": "^3.0.5",
+ "@types/react": "^19.2.14",
+ "@types/react-reconciler": "^0.33.0",
+ "@types/sharp": "^0.32.0",
+ "@types/turndown": "^5.0.6",
"ajv": "^8.18.0",
"asciichart": "^1.5.25",
"audio-capture-napi": "workspace:*",
@@ -112,12 +120,14 @@
"fuse.js": "^7.1.0",
"get-east-asian-width": "^1.5.0",
"google-auth-library": "^10.6.2",
+ "he": "^1.2.0",
"highlight.js": "^11.11.1",
"https-proxy-agent": "^8.0.0",
"ignore": "^7.0.5",
"image-processor-napi": "workspace:*",
"indent-string": "^5.0.0",
"jsonc-parser": "^3.3.1",
+ "knip": "^6.1.1",
"lodash-es": "^4.17.23",
"lru-cache": "^11.2.7",
"marked": "^17.0.5",
@@ -140,6 +150,7 @@
"tree-kill": "^1.2.2",
"turndown": "^7.2.2",
"type-fest": "^5.5.0",
+ "typescript": "^6.0.2",
"undici": "^7.24.6",
"url-handler-napi": "workspace:*",
"usehooks-ts": "^3.1.1",
@@ -150,16 +161,6 @@
"ws": "^8.20.0",
"xss": "^1.0.15",
"yaml": "^2.8.3",
- "zod": "^4.3.6",
- "@biomejs/biome": "^2.4.10",
- "@types/bun": "^1.3.11",
- "@types/cacache": "^20.0.1",
- "@types/plist": "^3.0.5",
- "@types/react": "^19.2.14",
- "@types/react-reconciler": "^0.33.0",
- "@types/sharp": "^0.32.0",
- "@types/turndown": "^5.0.6",
- "knip": "^6.1.1",
- "typescript": "^6.0.2"
+ "zod": "^4.3.6"
}
}
diff --git a/src/tools/WebFetchTool/utils.ts b/src/tools/WebFetchTool/utils.ts
index a805792b0..f75e358b4 100644
--- a/src/tools/WebFetchTool/utils.ts
+++ b/src/tools/WebFetchTool/utils.ts
@@ -384,7 +384,7 @@ export async function getURLMarkdownContent(
// This is for enterprise customers with restrictive security policies
// that prevent outbound connections to claude.ai
const settings = getSettings_DEPRECATED()
- if (!settings.skipWebFetchPreflight) {
+ if (settings.skipWebFetchPreflight === false) {
const checkResult = await checkDomainBlocklist(hostname)
switch (checkResult.status) {
case 'allowed':
diff --git a/src/tools/WebSearchTool/WebSearchTool.ts b/src/tools/WebSearchTool/WebSearchTool.ts
index 8afeab554..77fba39f5 100644
--- a/src/tools/WebSearchTool/WebSearchTool.ts
+++ b/src/tools/WebSearchTool/WebSearchTool.ts
@@ -1,19 +1,9 @@
-import type {
- BetaContentBlock,
- BetaWebSearchTool20250305,
-} from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
-import { getAPIProvider } from 'src/utils/model/providers.js'
import type { PermissionResult } from 'src/utils/permissions/PermissionResult.js'
import { z } from 'zod/v4'
-import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
-import { queryModelWithStreaming } from '../../services/api/claude.js'
import { buildTool, type ToolDef } from '../../Tool.js'
import { lazySchema } from '../../utils/lazySchema.js'
-import { logError } from '../../utils/log.js'
-import { createUserMessage } from '../../utils/messages.js'
-import { getMainLoopModel, getSmallFastModel } from '../../utils/model/model.js'
-import { jsonParse, jsonStringify } from '../../utils/slowOperations.js'
-import { asSystemPrompt } from '../../utils/systemPromptType.js'
+import { jsonStringify } from '../../utils/slowOperations.js'
+import { createAdapter } from './adapters/index.js'
import { getWebSearchPrompt, WEB_SEARCH_TOOL_NAME } from './prompt.js'
import {
getToolUseSummary,
@@ -37,12 +27,11 @@ const inputSchema = lazySchema(() =>
)
type InputSchema = ReturnType<typeof inputSchema>
-type Input = z.infer
-
const searchResultSchema = lazySchema(() => {
const searchHitSchema = z.object({
title: z.string().describe('The title of the search result'),
url: z.string().describe('The URL of the search result'),
+ snippet: z.string().optional().describe('A short description of the search result'),
})
return z.object({
@@ -73,82 +62,6 @@ export type { WebSearchProgress } from '../../types/tools.js'
import type { WebSearchProgress } from '../../types/tools.js'
-function makeToolSchema(input: Input): BetaWebSearchTool20250305 {
- return {
- type: 'web_search_20250305',
- name: 'web_search',
- allowed_domains: input.allowed_domains,
- blocked_domains: input.blocked_domains,
- max_uses: 8, // Hardcoded to 8 searches maximum
- }
-}
-
-function makeOutputFromSearchResponse(
- result: BetaContentBlock[],
- query: string,
- durationSeconds: number,
-): Output {
- // The result is a sequence of these blocks:
- // - text to start -- always?
- // [
- // - server_tool_use
- // - web_search_tool_result
- // - text and citation blocks intermingled
- // ]+ (this block repeated for each search)
-
- const results: (SearchResult | string)[] = []
- let textAcc = ''
- let inText = true
-
- for (const block of result) {
- if (block.type === 'server_tool_use') {
- if (inText) {
- inText = false
- if (textAcc.trim().length > 0) {
- results.push(textAcc.trim())
- }
- textAcc = ''
- }
- continue
- }
-
- if (block.type === 'web_search_tool_result') {
- // Handle error case - content is a WebSearchToolResultError
- if (!Array.isArray(block.content)) {
- const errorMessage = `Web search error: ${block.content.error_code}`
- logError(new Error(errorMessage))
- results.push(errorMessage)
- continue
- }
- // Success case - add results to our collection
- const hits = block.content.map(r => ({ title: r.title, url: r.url }))
- results.push({
- tool_use_id: block.tool_use_id,
- content: hits,
- })
- }
-
- if (block.type === 'text') {
- if (inText) {
- textAcc += block.text
- } else {
- inText = true
- textAcc = block.text
- }
- }
- }
-
- if (textAcc.length) {
- results.push(textAcc.trim())
- }
-
- return {
- query,
- results,
- durationSeconds,
- }
-}
-
export const WebSearchTool = buildTool({
name: WEB_SEARCH_TOOL_NAME,
searchHint: 'search the web for current information',
@@ -166,30 +79,9 @@ export const WebSearchTool = buildTool({
return summary ? `Searching for ${summary}` : 'Searching the web'
},
isEnabled() {
- const provider = getAPIProvider()
- const model = getMainLoopModel()
-
- // Enable for firstParty
- if (provider === 'firstParty') {
- return true
- }
-
- // Enable for Vertex AI with supported models (Claude 4.0+)
- if (provider === 'vertex') {
- const supportsWebSearch =
- model.includes('claude-opus-4') ||
- model.includes('claude-sonnet-4') ||
- model.includes('claude-haiku-4')
-
- return supportsWebSearch
- }
-
- // Foundry only ships models that already support Web Search
- if (provider === 'foundry') {
- return true
- }
-
- return false
+ // Always enabled — the adapter factory selects the appropriate backend
+ // (API server-side search or Bing fallback) based on provider capabilities.
+ return true
},
get inputSchema(): InputSchema {
return inputSchema()
@@ -227,9 +119,6 @@ export const WebSearchTool = buildTool({
renderToolUseProgressMessage,
renderToolResultMessage,
extractSearchText() {
- // renderToolResultMessage shows only "Did N searches in Xs" chrome —
- // the results[] content never appears on screen. Heuristic would index
- // string entries in results[] (phantom match). Nothing to search.
return ''
},
async validateInput(input) {
@@ -254,149 +143,42 @@ export const WebSearchTool = buildTool({
async call(input, context, _canUseTool, _parentMessage, onProgress) {
const startTime = performance.now()
const { query } = input
- const userMessage = createUserMessage({
- content: 'Perform a web search for the query: ' + query,
- })
- const toolSchema = makeToolSchema(input)
- const useHaiku = getFeatureValue_CACHED_MAY_BE_STALE(
- 'tengu_plum_vx3',
- false,
- )
-
- const appState = context.getAppState()
- const queryStream = queryModelWithStreaming({
- messages: [userMessage],
- systemPrompt: asSystemPrompt([
- 'You are an assistant for performing a web search tool use',
- ]),
- thinkingConfig: useHaiku
- ? { type: 'disabled' as const }
- : context.options.thinkingConfig,
- tools: [],
+ const adapter = createAdapter()
+ let progressSeq = 0 // per-call sequence: Date.now() can repeat within one millisecond
+ const adapterResults = await adapter.search(query, {
+ allowedDomains: input.allowed_domains,
+ blockedDomains: input.blocked_domains,
signal: context.abortController.signal,
- options: {
- getToolPermissionContext: async () => appState.toolPermissionContext,
- model: useHaiku ? getSmallFastModel() : context.options.mainLoopModel,
- toolChoice: useHaiku ? { type: 'tool', name: 'web_search' } : undefined,
- isNonInteractiveSession: context.options.isNonInteractiveSession,
- hasAppendSystemPrompt: !!context.options.appendSystemPrompt,
- extraToolSchemas: [toolSchema],
- querySource: 'web_search_tool',
- agents: context.options.agentDefinitions.activeAgents,
- mcpTools: [],
- agentId: context.agentId,
- effortValue: appState.effortValue,
+ onProgress(progress) {
+ if (!onProgress) return // no tool-level listener: nothing to forward
+ progressSeq++
+ onProgress({
+ toolUseID: `search-progress-${progressSeq}`,
+ data: progress,
+ })
},
})
- const allContentBlocks: BetaContentBlock[] = []
- let currentToolUseId = null
- let currentToolUseJson = ''
- let progressCounter = 0
- const toolUseQueries = new Map() // Map of tool_use_id to query
-
- for await (const event of queryStream) {
- if (event.type === 'assistant') {
- const msg = event as { message: { content: BetaContentBlock[] } }
- allContentBlocks.push(...msg.message.content)
- continue
- }
-
- // Track tool use ID when server_tool_use starts
- if (
- event.type === 'stream_event'
- ) {
- const streamEvt = event as { event?: { type: string; content_block?: { type: string; id?: string; tool_use_id?: string; content?: unknown; [key: string]: unknown }; delta?: { type: string; partial_json?: string; [key: string]: unknown }; [key: string]: unknown } }
- if (streamEvt.event?.type === 'content_block_start') {
- const contentBlock = streamEvt.event.content_block
- if (contentBlock && contentBlock.type === 'server_tool_use') {
- currentToolUseId = contentBlock.id as string
- currentToolUseJson = ''
- // Note: The ServerToolUseBlock doesn't contain input.query
- // The actual query comes through input_json_delta events
- continue
- }
- }
-
- // Accumulate JSON for current tool use
- if (
- currentToolUseId &&
- streamEvt.event?.type === 'content_block_delta'
- ) {
- const delta = streamEvt.event.delta
- if (delta?.type === 'input_json_delta' && delta.partial_json) {
- currentToolUseJson += delta.partial_json
-
- // Try to extract query from partial JSON for progress updates
- try {
- // Look for a complete query field
- const queryMatch = currentToolUseJson.match(
- /"query"\s*:\s*"((?:[^"\\]|\\.)*)"/,
- )
- if (queryMatch && queryMatch[1]) {
- // The regex properly handles escaped characters
- const query = jsonParse('"' + queryMatch[1] + '"')
-
- if (
- !toolUseQueries.has(currentToolUseId) ||
- toolUseQueries.get(currentToolUseId) !== query
- ) {
- toolUseQueries.set(currentToolUseId, query)
- progressCounter++
- if (onProgress) {
- onProgress({
- toolUseID: `search-progress-${progressCounter}`,
- data: {
- type: 'query_update',
- query,
- },
- })
- }
- }
- }
- } catch {
- // Ignore parsing errors for partial JSON
- }
- }
- }
-
- // Yield progress when search results come in
- if (
- streamEvt.event?.type === 'content_block_start'
- ) {
- const contentBlock = streamEvt.event.content_block
- if (contentBlock && contentBlock.type === 'web_search_tool_result') {
- // Get the actual query that was used for this search
- const toolUseId = contentBlock.tool_use_id
- const actualQuery = toolUseQueries.get(toolUseId) || query
- const content = contentBlock.content
-
- progressCounter++
- if (onProgress) {
- onProgress({
- toolUseID: toolUseId || `search-progress-${progressCounter}`,
- data: {
- type: 'search_results_received',
- resultCount: Array.isArray(content) ? content.length : 0,
- query: actualQuery,
- },
- })
- }
- }
- }
- } // end stream_event
- }
-
- // Process the final result
const endTime = performance.now()
const durationSeconds = (endTime - startTime) / 1000
- const data = makeOutputFromSearchResponse(
- allContentBlocks,
+ // Convert adapter SearchResult[] to legacy Output format
+ const results: (SearchResult | string)[] = []
+ if (adapterResults.length > 0) {
+ results.push({
+ tool_use_id: 'adapter-search-1',
+ content: adapterResults.map(r => ({ title: r.title, url: r.url, snippet: r.snippet })),
+ })
+ } else {
+ results.push('No search results found.')
+ }
+
+ const data: Output = {
query,
+ results,
durationSeconds,
- )
+ }
return { data }
},
mapToolResultToToolResultBlockParam(output, toolUseID) {
@@ -404,20 +186,23 @@ export const WebSearchTool = buildTool({
let formattedOutput = `Web search results for query: "${query}"\n\n`
- // Process the results array - it can contain both string summaries and search result objects.
- // Guard against null/undefined entries that can appear after JSON round-tripping
- // (e.g., from compaction or transcript deserialization).
;(results ?? []).forEach(result => {
if (result == null) {
return
}
if (typeof result === 'string') {
- // Text summary
formattedOutput += result + '\n\n'
} else {
- // Search result with links
if (result.content?.length > 0) {
- formattedOutput += `Links: ${jsonStringify(result.content)}\n\n`
+ formattedOutput += 'Links:\n'
+ for (const link of result.content) {
+ formattedOutput += ` - [${link.title}](${link.url})`
+ if (link.snippet) {
+ formattedOutput += `: ${link.snippet}`
+ }
+ formattedOutput += '\n'
+ }
+ formattedOutput += '\n'
} else {
formattedOutput += 'No links found.\n\n'
}
diff --git a/src/tools/WebSearchTool/__tests__/bingAdapter.integration.ts b/src/tools/WebSearchTool/__tests__/bingAdapter.integration.ts
new file mode 100644
index 000000000..313a973d7
--- /dev/null
+++ b/src/tools/WebSearchTool/__tests__/bingAdapter.integration.ts
@@ -0,0 +1,82 @@
+/**
+ * Integration test for BingSearchAdapter — hits the real Bing search.
+ *
+ * Usage:
+ * bun run src/tools/WebSearchTool/__tests__/bingAdapter.integration.ts
+ *
+ * Optional env vars:
+ * BING_QUERY — search query (default: "Claude AI Anthropic")
+ */
+
+// Provide MACRO globals needed by the codebase when running outside dev mode.
+if (!globalThis.MACRO) {
+  globalThis.MACRO = { VERSION: '0.0.0-test', BUILD_TIME: '0' } as any
+}
+
+// The adapter is loaded with a dynamic import inside main(): a static import
+// would be hoisted and evaluated before the MACRO setup above has run.
+const query = process.env.BING_QUERY || 'Claude AI Anthropic'
+
+/** Runs one live search, prints the hits, and exits with code 1 on failure. */
+async function main(): Promise<void> {
+  const { BingSearchAdapter } = await import('../adapters/bingAdapter')
+  console.log(`\n🔍 Searching Bing for: "${query}"\n`)
+  const adapter = new BingSearchAdapter()
+  const startTime = Date.now()
+
+  const results = await adapter.search(query, {
+    onProgress: (p) => {
+      if (p.type === 'query_update') {
+        console.log(` → Query sent: ${p.query}`)
+      } else if (p.type === 'search_results_received') {
+        console.log(` → Received ${p.resultCount} results`)
+      }
+    },
+  })
+
+  const elapsed = Date.now() - startTime
+  console.log(`\n✅ Done in ${elapsed}ms — ${results.length} result(s)\n`)
+
+  if (results.length === 0) {
+    console.log('⚠️ No results returned. Possible causes:')
+    console.log(' - Bing returned a CAPTCHA or rate-limited the request')
+    console.log(' - Network/firewall issue')
+    console.log(' - Bing HTML structure changed')
+    console.log(' - Anti-bot detection triggered\n')
+    process.exit(1)
+  }
+
+  for (const [i, r] of results.entries()) {
+    console.log(` ${i + 1}. ${r.title}`)
+    console.log(` ${r.url}`)
+    if (r.snippet) {
+      const snippet = r.snippet.replace(/\n/g, ' ')
+      console.log(` ${snippet.slice(0, 150)}${snippet.length > 150 ? '…' : ''}`)
+    }
+    console.log()
+  }
+
+  // Validate result structure
+  let passed = true
+  for (const [i, r] of results.entries()) {
+    if (!r.title || typeof r.title !== 'string') {
+      console.error(`❌ Result ${i + 1}: missing or non-string title`, r)
+      passed = false
+    }
+    if (!r.url || !r.url.startsWith('http')) {
+      console.error(`❌ Result ${i + 1}: missing or non-http url`, r)
+      passed = false
+    }
+  }
+
+  if (passed) {
+    console.log('✅ All results have valid structure.\n')
+  } else {
+    process.exit(1)
+  }
+}
+
+main().catch((e) => {
+  console.error('❌ Fatal error:', e)
+  process.exit(1)
+})
diff --git a/src/tools/WebSearchTool/__tests__/bingAdapter.test.ts b/src/tools/WebSearchTool/__tests__/bingAdapter.test.ts
new file mode 100644
index 000000000..02a215966
--- /dev/null
+++ b/src/tools/WebSearchTool/__tests__/bingAdapter.test.ts
@@ -0,0 +1,499 @@
+import { describe, expect, mock, test } from 'bun:test'
+import { extractBingResults, decodeHtmlEntities } from '../adapters/bingAdapter'
+
+// ---------------------------------------------------------------------------
+// decodeHtmlEntities — inputs are spelled as literal entities (&amp;, &#39;, …)
+// ---------------------------------------------------------------------------
+
+describe('decodeHtmlEntities', () => {
+  test('decodes common named entities', () => {
+    expect(decodeHtmlEntities('&amp; &lt; &gt;')).toBe('& < >')
+  })
+
+  test('decodes quote entities', () => {
+    expect(decodeHtmlEntities('&quot;hello&quot;')).toBe('"hello"')
+  })
+
+  test('decodes numeric and hex apostrophe entities', () => {
+    expect(decodeHtmlEntities('&#39;it&#x27;s')).toBe("'it's")
+  })
+
+  test('decodes &nbsp; to non-breaking space (\\u00A0)', () => {
+    expect(decodeHtmlEntities('a&nbsp;b')).toBe('a\u00A0b')
+  })
+
+  test('returns plain text unchanged', () => {
+    expect(decodeHtmlEntities('hello world')).toBe('hello world')
+  })
+
+  test('handles empty string', () => {
+    expect(decodeHtmlEntities('')).toBe('')
+  })
+
+  test('decodes multiple occurrences of the same entity', () => {
+    expect(decodeHtmlEntities('a&amp;b&amp;c')).toBe('a&b&c')
+  })
+
+  test('handles mixed entities in one string', () => {
+    expect(decodeHtmlEntities('&lt;a href=&quot;x&quot;&gt;')).toBe('<a href="x">')
+  })
+})
+
+// ---------------------------------------------------------------------------
+// extractBingResults
+// ---------------------------------------------------------------------------
+
+describe('extractBingResults', () => {
+ test('extracts results from standard Bing HTML', () => {
+ const html = `
+
+ -
+
+
+
First result description
+
+
+ -
+
+
+
Second result description
+
+
+
+ `
+ const results = extractBingResults(html)
+ expect(results).toHaveLength(2)
+ expect(results[0]).toEqual({
+ title: 'Example Title 1',
+ url: 'https://example.com/page1',
+ snippet: 'First result description',
+ })
+ expect(results[1]).toEqual({
+ title: 'Example Title 2',
+ url: 'https://example.com/page2',
+ snippet: 'Second result description',
+ })
+ })
+
+ test('returns empty array when no b_algo blocks exist', () => {
+ const html = `
+
+ - Ad result
+ - Answer card
+
+ `
+ expect(extractBingResults(html)).toEqual([])
+ })
+
+ test('returns empty array for empty HTML', () => {
+ expect(extractBingResults('')).toEqual([])
+ })
+
+ test('returns empty array for unrelated HTML', () => {
+ expect(extractBingResults('Hello')).toEqual([])
+ })
+
+ test('skips Bing-internal links', () => {
+ const html = `
+
+
+
+
+
+
+
+
+
+ `
+ expect(extractBingResults(html)).toEqual([])
+ })
+
+ test('strips HTML tags from titles', () => {
+ const html = `
+
+
+
+ `
+ const results = extractBingResults(html)
+ expect(results).toHaveLength(1)
+ expect(results[0].title).toBe('Result with bold and italic')
+ })
+
+ test('decodes HTML entities in titles', () => {
+ const html = `
+
+
+
+ `
+ const results = extractBingResults(html)
+ expect(results[0].title).toBe('Tom & Jerry ')
+ })
+
+ test('extracts snippet from b_lineclamp class', () => {
+ const html = `
+
+
+ Lineclamp snippet text here
+
+ `
+ const results = extractBingResults(html)
+ expect(results[0].snippet).toBe('Lineclamp snippet text here')
+ })
+
+ test('extracts snippet from b_caption paragraph fallback', () => {
+ const html = `
+
+
+
+
Caption paragraph text
+
+
+ `
+ const results = extractBingResults(html)
+ expect(results[0].snippet).toBe('Caption paragraph text')
+ })
+
+ test('extracts snippet from b_caption div fallback', () => {
+ const html = `
+
+
+ Direct caption text without p tag
+
+ `
+ const results = extractBingResults(html)
+ expect(results[0].snippet).toBe('Direct caption text without p tag')
+ })
+
+ test('returns undefined snippet when no caption exists', () => {
+ const html = `
+
+
+
+ `
+ const results = extractBingResults(html)
+ expect(results[0].snippet).toBeUndefined()
+ })
+
+ test('handles mixed result types and only extracts b_algo', () => {
+ const html = `
+
+
+ -
+
+
A real snippet
+
+ People also ask
+ -
+
+
+
+ `
+ const results = extractBingResults(html)
+ expect(results).toHaveLength(2)
+ expect(results[0].title).toBe('Real Result')
+ expect(results[1].title).toBe('Another Result')
+ })
+
+ test('skips b_algo blocks without h2 > a structure', () => {
+ const html = `
+
+ No link here
+
+
+
+
+ `
+ const results = extractBingResults(html)
+ expect(results).toHaveLength(1)
+ expect(results[0].title).toBe('Valid Result')
+ })
+
+ test('handles extra whitespace in h2 > a structure', () => {
+ const html = `
+
+
+
+ `
+ const results = extractBingResults(html)
+ expect(results).toHaveLength(1)
+ expect(results[0].title).toBe('Whitespace Title')
+ })
+
+ test('handles snippet with HTML entities', () => {
+ const html = `
+
+
+ 5 < 10 & 10 > 5
+
+ `
+ const results = extractBingResults(html)
+ expect(results[0].snippet).toBe('5 < 10 & 10 > 5')
+ })
+
+ test('handles real-world Bing HTML structure', () => {
+ const html = `
+
+ -
+
+
+
+ https://docs.python.org
+
+
+ Welcome to the Python Tutorial. This tutorial introduces you to the basic concepts and features...
+
+
+
+ -
+
+
+
+ https://realpython.com
+
+
+ The ultimate Python guide for beginners and experts alike.
+
+
+
+
+ `
+ const results = extractBingResults(html)
+ expect(results).toHaveLength(2)
+ expect(results[0].title).toBe('Python Tutorial')
+ expect(results[0].url).toBe('https://docs.python.org/3/tutorial/index.html')
+ expect(results[0].snippet).toContain('Welcome to the Python Tutorial')
+ expect(results[1].title).toBe('Real Python Guide')
+ expect(results[1].snippet).toContain('ultimate Python guide')
+ })
+})
+
+// ---------------------------------------------------------------------------
+// BingSearchAdapter.search (integration with mocked axios)
+// ---------------------------------------------------------------------------
+
+describe('BingSearchAdapter.search', () => {
+ // Dynamic import so mock.module() takes effect
+ const createAdapter = async () => {
+ const { BingSearchAdapter } = await import('../adapters/bingAdapter')
+ return new BingSearchAdapter()
+ }
+
+ const SAMPLE_HTML = `
+
+ -
+
+
Snippet one
+
+ -
+
+
Snippet two
+
+
+ `
+
+ test('returns parsed results from fetched HTML', async () => {
+ mock.module('axios', () => ({
+ default: {
+ get: mock(() => Promise.resolve({ data: SAMPLE_HTML })),
+ isCancel: () => false,
+ },
+ }))
+ mock.module('../../../utils/http', () => ({
+ getWebFetchUserAgent: () => 'TestAgent/1.0',
+ }))
+
+ const adapter = await createAdapter()
+ const results = await adapter.search('test query', {})
+ expect(results).toHaveLength(2)
+ expect(results[0].title).toBe('Result One')
+ expect(results[1].title).toBe('Result Two')
+ })
+
+ test('calls onProgress with query_update and search_results_received', async () => {
+ mock.module('axios', () => ({
+ default: {
+ get: mock(() => Promise.resolve({ data: SAMPLE_HTML })),
+ isCancel: () => false,
+ },
+ }))
+ mock.module('../../../utils/http', () => ({
+ getWebFetchUserAgent: () => 'TestAgent/1.0',
+ }))
+
+ const progressCalls: any[] = []
+ const onProgress = (p: any) => progressCalls.push(p)
+
+ const adapter = await createAdapter()
+ await adapter.search('test', { onProgress })
+
+ expect(progressCalls).toHaveLength(2)
+ expect(progressCalls[0].type).toBe('query_update')
+ expect(progressCalls[0].query).toBe('test')
+ expect(progressCalls[1].type).toBe('search_results_received')
+ expect(progressCalls[1].resultCount).toBe(2)
+ })
+
+ test('filters results by allowedDomains', async () => {
+ const mixedHtml = `
+
+ -
+
+
+ -
+
+
+
+ `
+ mock.module('axios', () => ({
+ default: {
+ get: mock(() => Promise.resolve({ data: mixedHtml })),
+ isCancel: () => false,
+ },
+ }))
+ mock.module('../../../utils/http', () => ({
+ getWebFetchUserAgent: () => 'TestAgent/1.0',
+ }))
+
+ const adapter = await createAdapter()
+ const results = await adapter.search('test', {
+ allowedDomains: ['allowed.com'],
+ })
+ expect(results).toHaveLength(1)
+ expect(results[0].url).toBe('https://allowed.com/a')
+ })
+
+ test('filters results by blockedDomains', async () => {
+ const mixedHtml = `
+
+ -
+
+
+ -
+
+
+
+ `
+ mock.module('axios', () => ({
+ default: {
+ get: mock(() => Promise.resolve({ data: mixedHtml })),
+ isCancel: () => false,
+ },
+ }))
+ mock.module('../../../utils/http', () => ({
+ getWebFetchUserAgent: () => 'TestAgent/1.0',
+ }))
+
+ const adapter = await createAdapter()
+ const results = await adapter.search('test', {
+ blockedDomains: ['spam.com'],
+ })
+ expect(results).toHaveLength(1)
+ expect(results[0].url).toBe('https://good.com/a')
+ })
+
+ test('filters subdomains with allowedDomains', async () => {
+ const html = `
+
+ -
+
+
+ -
+
+
+
+ `
+ mock.module('axios', () => ({
+ default: {
+ get: mock(() => Promise.resolve({ data: html })),
+ isCancel: () => false,
+ },
+ }))
+ mock.module('../../../utils/http', () => ({
+ getWebFetchUserAgent: () => 'TestAgent/1.0',
+ }))
+
+ const adapter = await createAdapter()
+ const results = await adapter.search('test', {
+ allowedDomains: ['example.com'],
+ })
+ expect(results).toHaveLength(1)
+ expect(results[0].url).toBe('https://docs.example.com/page')
+ })
+
+ test('throws AbortError when signal is already aborted', async () => {
+ mock.module('axios', () => ({
+ default: {
+ get: mock((_url: string, config: any) => {
+ if (config?.signal?.aborted) {
+ const err = new Error('canceled')
+ ;(err as any).__CANCEL__ = true
+ return Promise.reject(err)
+ }
+ return Promise.resolve({ data: SAMPLE_HTML })
+ }),
+ isCancel: (e: any) => e?.__CANCEL__ === true,
+ },
+ }))
+ mock.module('../../../utils/http', () => ({
+ getWebFetchUserAgent: () => 'TestAgent/1.0',
+ }))
+
+ const adapter = await createAdapter()
+ const controller = new AbortController()
+ controller.abort()
+
+ const { AbortError } = await import('../../../utils/errors')
+ await expect(
+ adapter.search('test', { signal: controller.signal }),
+ ).rejects.toThrow(AbortError)
+ })
+
+ test('re-throws non-abort axios errors', async () => {
+ const networkError = new Error('Network error')
+ mock.module('axios', () => ({
+ default: {
+ get: mock(() => Promise.reject(networkError)),
+ isCancel: () => false,
+ },
+ }))
+ mock.module('../../../utils/http', () => ({
+ getWebFetchUserAgent: () => 'TestAgent/1.0',
+ }))
+
+ const adapter = await createAdapter()
+ await expect(adapter.search('test', {})).rejects.toThrow('Network error')
+ })
+
+ test('encodes query parameter in URL', async () => {
+ const axiosGet = mock(() => Promise.resolve({ data: SAMPLE_HTML }))
+ mock.module('axios', () => ({
+ default: {
+ get: axiosGet,
+ isCancel: () => false,
+ },
+ }))
+ mock.module('../../../utils/http', () => ({
+ getWebFetchUserAgent: () => 'TestAgent/1.0',
+ }))
+
+ const adapter = await createAdapter()
+ await adapter.search('hello world & special=chars', {})
+
+ const calledUrl = axiosGet.mock.calls[0][0] as string
+ expect(calledUrl).toContain('q=hello%20world%20%26%20special%3Dchars')
+ })
+})
diff --git a/src/tools/WebSearchTool/adapters/apiAdapter.ts b/src/tools/WebSearchTool/adapters/apiAdapter.ts
new file mode 100644
index 000000000..ab78c4b21
--- /dev/null
+++ b/src/tools/WebSearchTool/adapters/apiAdapter.ts
@@ -0,0 +1,173 @@
+/**
+ * API-based search adapter — delegates to Anthropic's server-side
+ * web_search_20250305 tool via a secondary API call.
+ */
+
+import type {
+ BetaContentBlock,
+ BetaWebSearchTool20250305,
+} from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
+import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../../services/analytics/growthbook.js'
+import { queryModelWithStreaming } from '../../../services/api/claude.js'
+import { createUserMessage } from '../../../utils/messages.js'
+import { getMainLoopModel, getSmallFastModel } from '../../../utils/model/model.js'
+import { jsonParse } from '../../../utils/slowOperations.js'
+import { asSystemPrompt } from '../../../utils/systemPromptType.js'
+import type { SearchResult, SearchOptions, WebSearchAdapter } from './types.js'
+
+/**
+ * Builds the `web_search_20250305` server tool definition named `web_search`.
+ *
+ * The optional allow/block domain lists are forwarded unchanged (they stay
+ * `undefined` when not supplied) and the tool is capped at 8 uses.
+ */
+function makeToolSchema(input: { allowedDomains?: string[]; blockedDomains?: string[] }): BetaWebSearchTool20250305 {
+  const { allowedDomains, blockedDomains } = input
+  const schema: BetaWebSearchTool20250305 = {
+    type: 'web_search_20250305',
+    name: 'web_search',
+    allowed_domains: allowedDomains,
+    blocked_domains: blockedDomains,
+    max_uses: 8,
+  }
+  return schema
+}
+
+/**
+ * Search adapter that issues a secondary model request with the server-side
+ * `web_search` tool attached and collects the tool's result blocks from the
+ * streamed response.
+ */
+export class ApiSearchAdapter implements WebSearchAdapter {
+  /**
+   * Runs a web search for `query`.
+   *
+   * While the response streams, `options.onProgress` receives a
+   * `query_update` each time the tool input reveals a new search query and a
+   * `search_results_received` each time a result block starts.
+   *
+   * @param query - Search text; embedded in the user message sent to the model.
+   * @param options - Abort signal, progress callback and domain allow/block lists.
+   * @returns Title/URL pairs from every `web_search_tool_result` block found
+   *   in the assistant messages of the stream.
+   */
+  async search(
+    query: string,
+    options: SearchOptions,
+  ): Promise<SearchResult[]> {
+    const { signal, onProgress, allowedDomains, blockedDomains } = options
+
+    const userMessage = createUserMessage({
+      content: 'Perform a web search for the query: ' + query,
+    })
+    const toolSchema = makeToolSchema({ allowedDomains, blockedDomains })
+
+    // Feature flag: when on, use the small fast model with thinking disabled
+    // and the tool choice forced to `web_search`; otherwise use the main loop
+    // model with a thinking budget.
+    const useHaiku = getFeatureValue_CACHED_MAY_BE_STALE('tengu_plum_vx3', false)
+
+    const queryStream = queryModelWithStreaming({
+      messages: [userMessage],
+      systemPrompt: asSystemPrompt([
+        'You are an assistant for performing a web search tool use',
+      ]),
+      thinkingConfig: useHaiku
+        ? { type: 'disabled' as const }
+        : { type: 'enabled' as const, budgetTokens: 10000 },
+      tools: [],
+      signal: signal ?? new AbortController().signal,
+      options: {
+        getToolPermissionContext: async () => ({
+          mode: 'default' as const,
+          additionalWorkingDirectories: new Map(),
+          alwaysAllowRules: {},
+          alwaysDenyRules: {},
+          alwaysAskRules: {},
+          isBypassPermissionsModeAvailable: false,
+        }),
+        model: useHaiku ? getSmallFastModel() : getMainLoopModel(),
+        toolChoice: useHaiku ? { type: 'tool' as const, name: 'web_search' } : undefined,
+        isNonInteractiveSession: false,
+        hasAppendSystemPrompt: false,
+        // The search tool is passed only as an extra schema; `tools` stays empty.
+        extraToolSchemas: [toolSchema],
+        querySource: 'web_search_tool' as const,
+        agents: [],
+        mcpTools: [],
+        agentId: undefined,
+        effortValue: undefined,
+      },
+    })
+
+    const allContentBlocks: BetaContentBlock[] = []
+    // Id of the most recently started `server_tool_use` block, and the input
+    // JSON accumulated for it so far.
+    let currentToolUseId: string | null = null
+    let currentToolUseJson = ''
+    // Last query reported for each tool-use id.
+    const toolUseQueries = new Map<string, string>()
+    // Captures the (still escaped) value of a complete `"query": "..."` pair.
+    const queryFieldPattern = /"query"\s*:\s*"((?:[^"\\]|\\.)*)"/
+
+    for await (const event of queryStream) {
+      if (event.type === 'assistant') {
+        const msg = event as { message: { content: BetaContentBlock[] } }
+        allContentBlocks.push(...msg.message.content)
+        continue
+      }
+
+      if (event.type !== 'stream_event') {
+        continue
+      }
+
+      const streamEvt = event as {
+        event?: {
+          type: string
+          content_block?: { type: string; id?: string; tool_use_id?: string; content?: unknown; [key: string]: unknown }
+          delta?: { type: string; partial_json?: string; [key: string]: unknown }
+          [key: string]: unknown
+        }
+      }
+      const inner = streamEvt.event
+
+      if (inner?.type === 'content_block_start') {
+        const contentBlock = inner.content_block
+        if (contentBlock?.type === 'server_tool_use') {
+          // A new tool call begins: start accumulating its input from scratch.
+          currentToolUseId = contentBlock.id ?? null
+          currentToolUseJson = ''
+        } else if (contentBlock?.type === 'web_search_tool_result') {
+          const toolUseId = contentBlock.tool_use_id
+          const reportedQuery =
+            toolUseId === undefined ? undefined : toolUseQueries.get(toolUseId)
+          const content = contentBlock.content
+          onProgress?.({
+            type: 'search_results_received',
+            resultCount: Array.isArray(content) ? content.length : 0,
+            // Fall back to the caller's query when none was seen for this id.
+            query: reportedQuery ?? query,
+          })
+        }
+        continue
+      }
+
+      if (currentToolUseId && inner?.type === 'content_block_delta') {
+        const delta = inner.delta
+        if (delta?.type === 'input_json_delta' && delta.partial_json) {
+          currentToolUseJson += delta.partial_json
+          try {
+            const queryMatch = currentToolUseJson.match(queryFieldPattern)
+            if (queryMatch?.[1]) {
+              // Re-wrap the captured text in quotes so JSON escapes are decoded.
+              const parsedQuery: unknown = jsonParse('"' + queryMatch[1] + '"')
+              if (
+                typeof parsedQuery === 'string' &&
+                toolUseQueries.get(currentToolUseId) !== parsedQuery
+              ) {
+                toolUseQueries.set(currentToolUseId, parsedQuery)
+                onProgress?.({
+                  type: 'query_update',
+                  query: parsedQuery,
+                })
+              }
+            }
+          } catch {
+            // Ignore parsing errors for partial JSON; the next delta retries.
+          }
+        }
+      }
+    }
+
+    // Extract SearchResult[] from content blocks
+    return extractSearchResults(allContentBlocks)
+  }
+}
+
+/**
+ * Flattens the hits of every `web_search_tool_result` block into one list.
+ *
+ * Blocks of any other type, and result blocks whose `content` is not an
+ * array, contribute nothing. Only `title` and `url` are kept from each hit.
+ */
+function extractSearchResults(
+  blocks: BetaContentBlock[],
+): SearchResult[] {
+  return blocks.flatMap((block): SearchResult[] => {
+    if (block.type !== 'web_search_tool_result' || !Array.isArray(block.content)) {
+      return []
+    }
+    const hits = block.content as Array<{ title: string; url: string; page_age?: string; type?: string }>
+    return hits.map((hit) => ({ title: hit.title, url: hit.url }))
+  })
+}
diff --git a/src/tools/WebSearchTool/adapters/bingAdapter.ts b/src/tools/WebSearchTool/adapters/bingAdapter.ts
new file mode 100644
index 000000000..da9fefa4d
--- /dev/null
+++ b/src/tools/WebSearchTool/adapters/bingAdapter.ts
@@ -0,0 +1,204 @@
+/**
+ * Bing-based search adapter — fetches Bing search pages and extracts
+ * search results using regex pattern matching on raw HTML.
+ */
+
+import axios from 'axios'
+import he from 'he'
+import { AbortError } from '../../../utils/errors.js'
+import type { SearchResult, SearchOptions, WebSearchAdapter } from './types.js'
+
+const FETCH_TIMEOUT_MS = 30_000 // passed as the axios `timeout` of the Bing request
+
+/**
+ * Browser-like headers to avoid Bing's anti-bot JS-rendered response.
+ * These mimic Microsoft Edge on macOS to get full HTML search results.
+ *
+ * The whole object is sent as the `headers` of the Bing search request.
+ * The User-Agent and the Sec-Ch-Ua client hints both advertise Edge 131 on
+ * macOS; update them together.
+ */
+const BROWSER_HEADERS = {
+  'User-Agent':
+    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
+  Accept:
+    'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
+  'Accept-Language': 'en-US,en;q=0.9',
+  'Accept-Encoding': 'gzip, deflate, br',
+  'Cache-Control': 'no-cache',
+  Pragma: 'no-cache',
+  'Sec-Ch-Ua': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
+  'Sec-Ch-Ua-Mobile': '?0',
+  'Sec-Ch-Ua-Platform': '"macOS"',
+  'Sec-Fetch-Dest': 'document',
+  'Sec-Fetch-Mode': 'navigate',
+  'Sec-Fetch-Site': 'none',
+  'Sec-Fetch-User': '?1',
+  'Upgrade-Insecure-Requests': '1',
+} as const
+
+/**
+ * Search adapter that downloads a Bing results page and scrapes the results
+ * out of the returned HTML with `extractBingResults`.
+ */
+export class BingSearchAdapter implements WebSearchAdapter {
+  /**
+   * Returns true when `hostname` equals one of `domains` or is a subdomain of
+   * one of them. Both sides are expected to be lower-case.
+   */
+  private static hostMatches(hostname: string, domains: readonly string[]): boolean {
+    return domains.some((d) => hostname === d || hostname.endsWith('.' + d))
+  }
+
+  /**
+   * Fetches the Bing results page for `query` and returns the extracted
+   * results after client-side domain filtering.
+   *
+   * Reports a `query_update` before the request and a
+   * `search_results_received` (with the filtered count) after it.
+   *
+   * @param query - Search text; URL-encoded into the `q` parameter.
+   * @param options - Abort signal, progress callback and domain allow/block lists.
+   * @returns Results whose URL parses and passes the allow/block lists.
+   * @throws AbortError when `options.signal` is already aborted or aborts
+   *   during the request. Other request errors are re-thrown unchanged.
+   */
+  async search(
+    query: string,
+    options: SearchOptions,
+  ): Promise<SearchResult[]> {
+    const { signal, onProgress, allowedDomains, blockedDomains } = options
+
+    if (signal?.aborted) {
+      throw new AbortError()
+    }
+
+    onProgress?.({ type: 'query_update', query })
+
+    const url = `https://www.bing.com/search?q=${encodeURIComponent(query)}&setmkt=en-US`
+
+    // Forward the caller's abort to a private controller that drives the request.
+    const abortController = new AbortController()
+    const forwardAbort = (): void => abortController.abort()
+    signal?.addEventListener('abort', forwardAbort, { once: true })
+
+    let html: string
+    try {
+      const response = await axios.get(url, {
+        signal: abortController.signal,
+        timeout: FETCH_TIMEOUT_MS,
+        responseType: 'text',
+        headers: BROWSER_HEADERS,
+      })
+      html = response.data
+    } catch (e) {
+      if (axios.isCancel(e) || abortController.signal.aborted) {
+        throw new AbortError()
+      }
+      throw e
+    } finally {
+      // `once` only detaches the listener when it fires; remove it explicitly
+      // so a signal that never aborts does not accumulate one listener per search.
+      signal?.removeEventListener('abort', forwardAbort)
+    }
+
+    if (abortController.signal.aborted) {
+      throw new AbortError()
+    }
+
+    const rawResults = extractBingResults(html)
+
+    // `URL.hostname` is always lower-case, so compare against lower-cased lists.
+    const allowed = allowedDomains?.map((d) => d.toLowerCase())
+    const blocked = blockedDomains?.map((d) => d.toLowerCase())
+
+    // Client-side domain filtering
+    const results = rawResults.filter((r) => {
+      if (!r.url) return false
+      let hostname: string
+      try {
+        hostname = new URL(r.url).hostname
+      } catch {
+        // Unparseable URLs cannot be checked against the lists; drop them.
+        return false
+      }
+      // An empty allow list means "no restriction".
+      if (allowed?.length && !BingSearchAdapter.hostMatches(hostname, allowed)) {
+        return false
+      }
+      if (blocked && BingSearchAdapter.hostMatches(hostname, blocked)) {
+        return false
+      }
+      return true
+    })
+
+    onProgress?.({
+      type: 'search_results_received',
+      resultCount: results.length,
+      query,
+    })
+
+    return results
+  }
+}
+
+/**
+ * Extract organic search results from Bing HTML.
+ * Bing results live in `<li class="b_algo">` blocks within the results list.
+ */
+export function extractBingResults(html: string): SearchResult[] {
+ const results: SearchResult[] = []
+
+ const algoBlockRegex = /- ]*>([\s\S]*?)<\/li>/gi
+ let blockMatch: RegExpExecArray | null
+
+ while ((blockMatch = algoBlockRegex.exec(html)) !== null) {
+ const block = blockMatch[1]
+
+ // Extract the primary link from
+ const h2LinkRegex = /