mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-22 00:05:51 +00:00
feat: 修正 web search 工具
This commit is contained in:
34
DEV-LOG.md
34
DEV-LOG.md
@@ -1,5 +1,39 @@
|
|||||||
# DEV-LOG
|
# DEV-LOG
|
||||||
|
|
||||||
|
## WebSearch Bing 适配器补全 (2026-04-03)
|
||||||
|
|
||||||
|
原始 `WebSearchTool` 仅支持 Anthropic API 服务端搜索(`web_search_20250305` server tool),在非官方 API 端点(第三方代理)下搜索功能不可用。本次改动引入适配器架构,新增 Bing 搜索页面解析作为 fallback。
|
||||||
|
|
||||||
|
**新增文件:**
|
||||||
|
|
||||||
|
| 文件 | 说明 |
|
||||||
|
|------|------|
|
||||||
|
| `src/tools/WebSearchTool/adapters/types.ts` | 适配器接口定义:`WebSearchAdapter`、`SearchResult`、`SearchOptions`、`SearchProgress` |
|
||||||
|
| `src/tools/WebSearchTool/adapters/apiAdapter.ts` | API 适配器 — 将原有 `queryModelWithStreaming` 逻辑封装为 `ApiSearchAdapter` |
|
||||||
|
| `src/tools/WebSearchTool/adapters/bingAdapter.ts` | Bing 适配器 — 直接抓取 Bing HTML,正则提取搜索结果 |
|
||||||
|
| `src/tools/WebSearchTool/adapters/index.ts` | 适配器工厂 — 根据环境变量 / API Base URL 选择后端 |
|
||||||
|
| `src/tools/WebSearchTool/__tests__/bingAdapter.test.ts` | Bing 适配器单元测试(32 cases:decodeHtmlEntities、extractBingResults、search mock) |
|
||||||
|
| `src/tools/WebSearchTool/__tests__/bingAdapter.integration.ts` | Bing 适配器集成测试 — 真实网络请求验证 |
|
||||||
|
|
||||||
|
**重构文件:**
|
||||||
|
|
||||||
|
| 文件 | 变更 |
|
||||||
|
|------|------|
|
||||||
|
| `src/tools/WebSearchTool/WebSearchTool.ts` | 从直接调用 API 改为 `createAdapter()` 工厂模式;`isEnabled()` 始终返回 true;删除 ~200 行内联 API 调用逻辑 |
|
||||||
|
| `src/tools/WebFetchTool/utils.ts` | 预检条件从 `!settings.skipWebFetchPreflight`(未设置时为 true,即默认执行域名预检)改为显式 `settings.skipWebFetchPreflight === false`;未设置时默认跳过域名预检,仅在显式设为 `false` 时才执行 |
|
||||||
|
|
||||||
|
**Bing 适配器关键技术细节:**
|
||||||
|
|
||||||
|
1. **反爬绕过**:使用完整 Edge 浏览器请求头(含 `Sec-Ch-Ua`、`Sec-Fetch-*` 等 13 个标头),避免 Bing 返回 JS 渲染的空页面;`setmkt=en-US` 参数强制美式英语市场,避免 IP 地理定位导致的区域化结果(德语论坛、新加坡金价等不相关内容)
|
||||||
|
2. **URL 解码**(`resolveBingUrl()`):Bing 返回的重定向 URL(`bing.com/ck/a?...&u=a1aHR0cHM6Ly9...`)中 `u` 参数为 base64 编码的真实 URL,需解码后使用
|
||||||
|
3. **摘要提取**(`extractSnippet()`):三级降级策略 — `b_lineclamp` → `b_caption <p>` → `b_caption` 直接文本
|
||||||
|
4. **HTML 实体解码**(`decodeHtmlEntities()`):处理 7 种常见 HTML 实体
|
||||||
|
5. **域过滤**:客户端侧 `allowedDomains` / `blockedDomains` 过滤,支持子域名匹配
|
||||||
|
|
||||||
|
**当前状态**:`adapters/index.ts` 中 `createAdapter()` 硬编码返回 `BingSearchAdapter`,跳过了 API/Bing 自动选择逻辑(原逻辑被注释保留)。未来可通过取消注释恢复自动选择。
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## 移除反蒸馏机制 (2026-04-02)
|
## 移除反蒸馏机制 (2026-04-02)
|
||||||
|
|
||||||
项目中发现三处 anti-distillation 相关代码,全部移除。
|
项目中发现三处 anti-distillation 相关代码,全部移除。
|
||||||
|
|||||||
@@ -16,6 +16,7 @@
|
|||||||
- [x] Auto Mode 回归
|
- [x] Auto Mode 回归
|
||||||
- [x] 所有 Feature 现在可以通过环境变量配置, 而不是垃圾的 bun --feature
|
- [x] 所有 Feature 现在可以通过环境变量配置, 而不是垃圾的 bun --feature
|
||||||
- [x] 移除牢 A 的反蒸馏代码!!!
|
- [x] 移除牢 A 的反蒸馏代码!!!
|
||||||
|
- [x] 补全 web search 能力(用的 Bing 搜索)!!!
|
||||||
- [ ] V5 大规模重构石山代码, 全面模块分包
|
- [ ] V5 大规模重构石山代码, 全面模块分包
|
||||||
- [ ] V5 将会为全新分支, 届时 main 分支将会封存为历史版本
|
- [ ] V5 将会为全新分支, 届时 main 分支将会封存为历史版本
|
||||||
|
|
||||||
|
|||||||
3
bun.lock
3
bun.lock
@@ -76,6 +76,7 @@
|
|||||||
"fuse.js": "^7.1.0",
|
"fuse.js": "^7.1.0",
|
||||||
"get-east-asian-width": "^1.5.0",
|
"get-east-asian-width": "^1.5.0",
|
||||||
"google-auth-library": "^10.6.2",
|
"google-auth-library": "^10.6.2",
|
||||||
|
"he": "^1.2.0",
|
||||||
"highlight.js": "^11.11.1",
|
"highlight.js": "^11.11.1",
|
||||||
"https-proxy-agent": "^8.0.0",
|
"https-proxy-agent": "^8.0.0",
|
||||||
"ignore": "^7.0.5",
|
"ignore": "^7.0.5",
|
||||||
@@ -947,6 +948,8 @@
|
|||||||
|
|
||||||
"hasown": ["hasown@2.0.2", "https://registry.npmmirror.com/hasown/-/hasown-2.0.2.tgz", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="],
|
"hasown": ["hasown@2.0.2", "https://registry.npmmirror.com/hasown/-/hasown-2.0.2.tgz", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="],
|
||||||
|
|
||||||
|
"he": ["he@1.2.0", "https://registry.npmmirror.com/he/-/he-1.2.0.tgz", { "bin": { "he": "bin/he" } }, "sha512-F/1DnUGPopORZi0ni+CvrCgHQ5FyEAHRLSApuYWMmrbSwoN2Mn/7k+Gl38gJnR7yyDZk6WLXwiGod1JOWNDKGw=="],
|
||||||
|
|
||||||
"highlight.js": ["highlight.js@11.11.1", "https://registry.npmmirror.com/highlight.js/-/highlight.js-11.11.1.tgz", {}, "sha512-Xwwo44whKBVCYoliBQwaPvtd/2tYFkRQtXDWj1nackaV2JPXx3L0+Jvd8/qCJ2p+ML0/XVkJ2q+Mr+UVdpJK5w=="],
|
"highlight.js": ["highlight.js@11.11.1", "https://registry.npmmirror.com/highlight.js/-/highlight.js-11.11.1.tgz", {}, "sha512-Xwwo44whKBVCYoliBQwaPvtd/2tYFkRQtXDWj1nackaV2JPXx3L0+Jvd8/qCJ2p+ML0/XVkJ2q+Mr+UVdpJK5w=="],
|
||||||
|
|
||||||
"hono": ["hono@4.12.9", "https://registry.npmmirror.com/hono/-/hono-4.12.9.tgz", {}, "sha512-wy3T8Zm2bsEvxKZM5w21VdHDDcwVS1yUFFY6i8UobSsKfFceT7TOwhbhfKsDyx7tYQlmRM5FLpIuYvNFyjctiA=="],
|
"hono": ["hono@4.12.9", "https://registry.npmmirror.com/hono/-/hono-4.12.9.tgz", {}, "sha512-wy3T8Zm2bsEvxKZM5w21VdHDDcwVS1yUFFY6i8UobSsKfFceT7TOwhbhfKsDyx7tYQlmRM5FLpIuYvNFyjctiA=="],
|
||||||
|
|||||||
@@ -139,11 +139,137 @@ function getDeferredToolsCacheKey(deferredTools: Tools): string {
|
|||||||
|
|
||||||
AI 的信息获取不局限于本地代码:
|
AI 的信息获取不局限于本地代码:
|
||||||
|
|
||||||
- **WebSearch**:搜索互联网获取最新信息
|
- **WebSearch**(`src/tools/WebSearchTool/`):通过适配器搜索互联网 — 后端为 Anthropic API 的 `web_search_20250305` server tool,或直接解析 Bing 搜索页面
|
||||||
- **WebFetch**:抓取特定网页内容,转换为 Markdown 供 AI 阅读
|
- **WebFetch**(`src/tools/WebFetchTool/`):抓取特定 URL 内容,转换为 Markdown 供 AI 阅读
|
||||||
|
|
||||||
这让 AI 可以查阅文档、搜索 Stack Overflow、阅读 GitHub issue——和人类开发者的工作方式一致。
|
这让 AI 可以查阅文档、搜索 Stack Overflow、阅读 GitHub issue——和人类开发者的工作方式一致。
|
||||||
|
|
||||||
|
### WebSearch 实现机制
|
||||||
|
|
||||||
|
WebSearch 通过适配器模式支持两种搜索后端,由 `src/tools/WebSearchTool/adapters/` 中的工厂函数 `createAdapter()` 选择:
|
||||||
|
|
||||||
|
```
|
||||||
|
适配器架构:
|
||||||
|
WebSearchTool.call()
|
||||||
|
→ createAdapter() 选择后端
|
||||||
|
├─ ApiSearchAdapter — Anthropic API 服务端搜索(需官方 API 密钥)
|
||||||
|
└─ BingSearchAdapter — 直接抓取 Bing 搜索页面解析(无需 API 密钥)
|
||||||
|
→ adapter.search(query, options)
|
||||||
|
→ 转换为统一 SearchResult[] 格式返回
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 适配器选择逻辑
|
||||||
|
|
||||||
|
`adapters/index.ts` 中的工厂函数按以下优先级选择后端(注意:当前实现中 `createAdapter()` 硬编码返回 `BingSearchAdapter`,下表的自动选择逻辑以注释形式保留,尚未启用):
|
||||||
|
|
||||||
|
| 优先级 | 条件 | 适配器 |
|
||||||
|
|--------|------|--------|
|
||||||
|
| 1 | 环境变量 `WEB_SEARCH_ADAPTER=api` | `ApiSearchAdapter` |
|
||||||
|
| 2 | 环境变量 `WEB_SEARCH_ADAPTER=bing` | `BingSearchAdapter` |
|
||||||
|
| 3 | API Base URL 指向 Anthropic 官方 | `ApiSearchAdapter` |
|
||||||
|
| 4 | 第三方代理 / 非官方端点 | `BingSearchAdapter` |
|
||||||
|
|
||||||
|
适配器是无状态的,同一会话内缓存复用。
|
||||||
|
|
||||||
|
#### ApiSearchAdapter — API 服务端搜索
|
||||||
|
|
||||||
|
将搜索请求委托给 Anthropic API 的 `web_search_20250305` server tool:
|
||||||
|
|
||||||
|
```
|
||||||
|
调用链:
|
||||||
|
ApiSearchAdapter.search(query, options)
|
||||||
|
→ queryModelWithStreaming() 发起独立的 API 调用
|
||||||
|
→ 携带 extraToolSchemas: [BetaWebSearchTool20250305]
|
||||||
|
→ API 服务端执行搜索,返回流式事件
|
||||||
|
→ server_tool_use / web_search_tool_result / text 交替返回
|
||||||
|
→ extractSearchResults() 从 content blocks 提取 SearchResult[]
|
||||||
|
```
|
||||||
|
|
||||||
|
| 特性 | 实现 |
|
||||||
|
|------|------|
|
||||||
|
| **模型选择** | Feature flag `tengu_plum_vx3` 控制用 Haiku(强制 tool_choice)还是主模型 |
|
||||||
|
| **搜索上限** | 每次调用最多 8 次搜索(`max_uses: 8`) |
|
||||||
|
| **域过滤** | 支持 `allowedDomains` / `blockedDomains` |
|
||||||
|
| **进度追踪** | 流式解析 `input_json_delta` 提取 query,实时回调 `onProgress` |
|
||||||
|
|
||||||
|
#### BingSearchAdapter — Bing 搜索页面解析
|
||||||
|
|
||||||
|
直接抓取 Bing 搜索 HTML 并用正则提取结果,无需 API 密钥:
|
||||||
|
|
||||||
|
```
|
||||||
|
调用链:
|
||||||
|
BingSearchAdapter.search(query, options)
|
||||||
|
→ axios.get(bing.com/search?q=...) — 使用浏览器级别 headers 绕过反爬
|
||||||
|
→ extractBingResults(html)
|
||||||
|
→ 正则匹配 <li class="b_algo"> 块
|
||||||
|
→ 提取 <h2><a> 标题和 URL
|
||||||
|
→ resolveBingUrl() 解码 Bing 重定向链接
|
||||||
|
→ extractSnippet() 三级降级提取摘要
|
||||||
|
→ 客户端域过滤 (allowedDomains / blockedDomains)
|
||||||
|
→ 返回 SearchResult[]
|
||||||
|
```
|
||||||
|
|
||||||
|
**反爬策略**:Bing 对非浏览器 UA 返回需要 JS 渲染的空页面。适配器使用完整的 Edge 浏览器请求头(包含 `Sec-Ch-Ua`、`Sec-Fetch-*` 等现代浏览器标头)确保获得完整 HTML。同时使用 `setmkt=en-US` 参数统一市场定位,避免 Bing 基于用户 IP 做区域化定向(如跳转到德语/新加坡市场导致结果不相关)。
|
||||||
|
|
||||||
|
**URL 解码**:Bing 搜索结果中的 URL 为重定向格式(`bing.com/ck/a?...&u=a1aHR0cHM6Ly9...`),`resolveBingUrl()` 从 `u` 参数中 base64 解码出真实目标 URL(`a1` 前缀 = https,`a0` = http)。
|
||||||
|
|
||||||
|
**摘要提取**(`extractSnippet()`)按优先级尝试三个来源:
|
||||||
|
1. `<p class="b_lineclamp...">` — 带行截断的摘要段落
|
||||||
|
2. `<div class="b_caption">` 内的 `<p>` — 普通摘要段落
|
||||||
|
3. `<div class="b_caption">` 的直接文本内容 — 兜底方案
|
||||||
|
|
||||||
|
| 特性 | 实现 |
|
||||||
|
|------|------|
|
||||||
|
| **超时** | 30 秒(`FETCH_TIMEOUT_MS`) |
|
||||||
|
| **域过滤** | 支持 `allowedDomains` / `blockedDomains`,含子域名匹配 |
|
||||||
|
| **进度追踪** | 发送 query_update 和 search_results_received 回调 |
|
||||||
|
| **中止支持** | 外部 AbortSignal 传播到 axios 请求 |
|
||||||
|
|
||||||
|
### WebSearchTool 统一接口
|
||||||
|
|
||||||
|
`WebSearchTool`(`src/tools/WebSearchTool/WebSearchTool.ts`)是面向主循环的工具定义,所有 provider 均可使用(`isEnabled()` 始终返回 true)。它将适配器返回的 `SearchResult[]` 转换为内部 `Output` 格式,`mapToolResultToToolResultBlockParam` 将搜索结果格式化为带 markdown 超链接的文本,并附加 "REMINDER" 要求主模型在回复中包含 Sources。
|
||||||
|
|
||||||
|
### WebFetch 实现机制
|
||||||
|
|
||||||
|
WebFetch 是一个完整的 HTTP 客户端 + 内容处理管线:
|
||||||
|
|
||||||
|
```
|
||||||
|
调用链:
|
||||||
|
WebFetchTool.call({ url, prompt })
|
||||||
|
→ getURLMarkdownContent(url)
|
||||||
|
→ validateURL() — 长度≤2000、无用户名密码、公网域名
|
||||||
|
→ URL_CACHE 命中检查(15 分钟 TTL LRU,50MB 上限)
|
||||||
|
→ checkDomainBlocklist() — 调用 api.anthropic.com/api/web/domain_info 预检
|
||||||
|
→ getWithPermittedRedirects() — axios 请求,自定义重定向处理
|
||||||
|
→ HTML → Turndown 转 Markdown(懒加载单例,~1.4MB)
|
||||||
|
→ 非 HTML → 原始文本
|
||||||
|
→ 二进制(PDF 等)→ persistBinaryContent() 保存到磁盘
|
||||||
|
→ applyPromptToMarkdown()
|
||||||
|
→ 截断到 100K 字符
|
||||||
|
→ queryHaiku() 用小模型按 prompt 提取信息
|
||||||
|
→ 返回处理后的结果
|
||||||
|
```
|
||||||
|
|
||||||
|
安全防护多层设计:
|
||||||
|
|
||||||
|
| 层级 | 机制 | 说明 |
|
||||||
|
|------|------|------|
|
||||||
|
| **域名预检** | `checkDomainBlocklist()` | 调用 `api.anthropic.com/api/web/domain_info?domain=…`,5 分钟缓存 |
|
||||||
|
| **重定向控制** | `isPermittedRedirect()` | 仅允许同 host(±www)重定向,跨域重定向返回提示让 AI 重新调用 |
|
||||||
|
| **重定向深度** | `MAX_REDIRECTS = 10` | 防止重定向循环无限挂起 |
|
||||||
|
| **内容大小** | `MAX_HTTP_CONTENT_LENGTH = 10MB` | 单次响应上限 |
|
||||||
|
| **请求超时** | `FETCH_TIMEOUT_MS = 60s` | 主请求超时;域名预检 10s |
|
||||||
|
| **URL 验证** | `validateURL()` | 长度、协议、用户名密码、公网域名检查 |
|
||||||
|
| **egress 检测** | `X-Proxy-Error: blocked-by-allowlist` | 检测企业代理拦截 |
|
||||||
|
|
||||||
|
预批准域名(`src/tools/WebFetchTool/preapproved.ts`):
|
||||||
|
|
||||||
|
用户无需手动授权即可抓取的域名列表,包含 ~90 个主流技术文档站点(MDN、Python docs、React docs、AWS docs 等)。列表分为 hostname-only 和 path-prefix 两类,查找复杂度 O(1)。
|
||||||
|
|
||||||
|
对预批准域名,WebFetch 跳过 Haiku 摘要步骤(如果内容是 Markdown 且 < 100K 字符),直接返回原文——因为技术文档本身的结构化程度已经足够好。
|
||||||
|
|
||||||
|
权限模型方面,WebFetch 按 hostname 生成 `domain:xxx` 规则匹配用户的 allow/deny/ask 规则,支持用户对特定域名配置永久允许或拒绝。
|
||||||
|
|
||||||
### ripgrep 的流式输出
|
### ripgrep 的流式输出
|
||||||
|
|
||||||
对于交互式场景(如 QuickOpen),ripgrep 支持**流式输出**(`ripGrepStream()`):
|
对于交互式场景(如 QuickOpen),ripgrep 支持**流式输出**(`ripGrepStream()`):
|
||||||
|
|||||||
23
package.json
23
package.json
@@ -67,6 +67,7 @@
|
|||||||
"@aws-sdk/credential-provider-node": "^3.972.28",
|
"@aws-sdk/credential-provider-node": "^3.972.28",
|
||||||
"@aws-sdk/credential-providers": "^3.1020.0",
|
"@aws-sdk/credential-providers": "^3.1020.0",
|
||||||
"@azure/identity": "^4.13.1",
|
"@azure/identity": "^4.13.1",
|
||||||
|
"@biomejs/biome": "^2.4.10",
|
||||||
"@commander-js/extra-typings": "^14.0.0",
|
"@commander-js/extra-typings": "^14.0.0",
|
||||||
"@growthbook/growthbook": "^1.6.5",
|
"@growthbook/growthbook": "^1.6.5",
|
||||||
"@modelcontextprotocol/sdk": "^1.29.0",
|
"@modelcontextprotocol/sdk": "^1.29.0",
|
||||||
@@ -90,6 +91,13 @@
|
|||||||
"@opentelemetry/semantic-conventions": "^1.40.0",
|
"@opentelemetry/semantic-conventions": "^1.40.0",
|
||||||
"@smithy/core": "^3.23.13",
|
"@smithy/core": "^3.23.13",
|
||||||
"@smithy/node-http-handler": "^4.5.1",
|
"@smithy/node-http-handler": "^4.5.1",
|
||||||
|
"@types/bun": "^1.3.11",
|
||||||
|
"@types/cacache": "^20.0.1",
|
||||||
|
"@types/plist": "^3.0.5",
|
||||||
|
"@types/react": "^19.2.14",
|
||||||
|
"@types/react-reconciler": "^0.33.0",
|
||||||
|
"@types/sharp": "^0.32.0",
|
||||||
|
"@types/turndown": "^5.0.6",
|
||||||
"ajv": "^8.18.0",
|
"ajv": "^8.18.0",
|
||||||
"asciichart": "^1.5.25",
|
"asciichart": "^1.5.25",
|
||||||
"audio-capture-napi": "workspace:*",
|
"audio-capture-napi": "workspace:*",
|
||||||
@@ -112,12 +120,14 @@
|
|||||||
"fuse.js": "^7.1.0",
|
"fuse.js": "^7.1.0",
|
||||||
"get-east-asian-width": "^1.5.0",
|
"get-east-asian-width": "^1.5.0",
|
||||||
"google-auth-library": "^10.6.2",
|
"google-auth-library": "^10.6.2",
|
||||||
|
"he": "^1.2.0",
|
||||||
"highlight.js": "^11.11.1",
|
"highlight.js": "^11.11.1",
|
||||||
"https-proxy-agent": "^8.0.0",
|
"https-proxy-agent": "^8.0.0",
|
||||||
"ignore": "^7.0.5",
|
"ignore": "^7.0.5",
|
||||||
"image-processor-napi": "workspace:*",
|
"image-processor-napi": "workspace:*",
|
||||||
"indent-string": "^5.0.0",
|
"indent-string": "^5.0.0",
|
||||||
"jsonc-parser": "^3.3.1",
|
"jsonc-parser": "^3.3.1",
|
||||||
|
"knip": "^6.1.1",
|
||||||
"lodash-es": "^4.17.23",
|
"lodash-es": "^4.17.23",
|
||||||
"lru-cache": "^11.2.7",
|
"lru-cache": "^11.2.7",
|
||||||
"marked": "^17.0.5",
|
"marked": "^17.0.5",
|
||||||
@@ -140,6 +150,7 @@
|
|||||||
"tree-kill": "^1.2.2",
|
"tree-kill": "^1.2.2",
|
||||||
"turndown": "^7.2.2",
|
"turndown": "^7.2.2",
|
||||||
"type-fest": "^5.5.0",
|
"type-fest": "^5.5.0",
|
||||||
|
"typescript": "^6.0.2",
|
||||||
"undici": "^7.24.6",
|
"undici": "^7.24.6",
|
||||||
"url-handler-napi": "workspace:*",
|
"url-handler-napi": "workspace:*",
|
||||||
"usehooks-ts": "^3.1.1",
|
"usehooks-ts": "^3.1.1",
|
||||||
@@ -150,16 +161,6 @@
|
|||||||
"ws": "^8.20.0",
|
"ws": "^8.20.0",
|
||||||
"xss": "^1.0.15",
|
"xss": "^1.0.15",
|
||||||
"yaml": "^2.8.3",
|
"yaml": "^2.8.3",
|
||||||
"zod": "^4.3.6",
|
"zod": "^4.3.6"
|
||||||
"@biomejs/biome": "^2.4.10",
|
|
||||||
"@types/bun": "^1.3.11",
|
|
||||||
"@types/cacache": "^20.0.1",
|
|
||||||
"@types/plist": "^3.0.5",
|
|
||||||
"@types/react": "^19.2.14",
|
|
||||||
"@types/react-reconciler": "^0.33.0",
|
|
||||||
"@types/sharp": "^0.32.0",
|
|
||||||
"@types/turndown": "^5.0.6",
|
|
||||||
"knip": "^6.1.1",
|
|
||||||
"typescript": "^6.0.2"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -384,7 +384,7 @@ export async function getURLMarkdownContent(
|
|||||||
// This is for enterprise customers with restrictive security policies
|
// This is for enterprise customers with restrictive security policies
|
||||||
// that prevent outbound connections to claude.ai
|
// that prevent outbound connections to claude.ai
|
||||||
const settings = getSettings_DEPRECATED()
|
const settings = getSettings_DEPRECATED()
|
||||||
if (!settings.skipWebFetchPreflight) {
|
if (settings.skipWebFetchPreflight === false) {
|
||||||
const checkResult = await checkDomainBlocklist(hostname)
|
const checkResult = await checkDomainBlocklist(hostname)
|
||||||
switch (checkResult.status) {
|
switch (checkResult.status) {
|
||||||
case 'allowed':
|
case 'allowed':
|
||||||
|
|||||||
@@ -1,19 +1,9 @@
|
|||||||
import type {
|
|
||||||
BetaContentBlock,
|
|
||||||
BetaWebSearchTool20250305,
|
|
||||||
} from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
|
|
||||||
import { getAPIProvider } from 'src/utils/model/providers.js'
|
|
||||||
import type { PermissionResult } from 'src/utils/permissions/PermissionResult.js'
|
import type { PermissionResult } from 'src/utils/permissions/PermissionResult.js'
|
||||||
import { z } from 'zod/v4'
|
import { z } from 'zod/v4'
|
||||||
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../services/analytics/growthbook.js'
|
|
||||||
import { queryModelWithStreaming } from '../../services/api/claude.js'
|
|
||||||
import { buildTool, type ToolDef } from '../../Tool.js'
|
import { buildTool, type ToolDef } from '../../Tool.js'
|
||||||
import { lazySchema } from '../../utils/lazySchema.js'
|
import { lazySchema } from '../../utils/lazySchema.js'
|
||||||
import { logError } from '../../utils/log.js'
|
import { jsonStringify } from '../../utils/slowOperations.js'
|
||||||
import { createUserMessage } from '../../utils/messages.js'
|
import { createAdapter } from './adapters/index.js'
|
||||||
import { getMainLoopModel, getSmallFastModel } from '../../utils/model/model.js'
|
|
||||||
import { jsonParse, jsonStringify } from '../../utils/slowOperations.js'
|
|
||||||
import { asSystemPrompt } from '../../utils/systemPromptType.js'
|
|
||||||
import { getWebSearchPrompt, WEB_SEARCH_TOOL_NAME } from './prompt.js'
|
import { getWebSearchPrompt, WEB_SEARCH_TOOL_NAME } from './prompt.js'
|
||||||
import {
|
import {
|
||||||
getToolUseSummary,
|
getToolUseSummary,
|
||||||
@@ -37,12 +27,11 @@ const inputSchema = lazySchema(() =>
|
|||||||
)
|
)
|
||||||
type InputSchema = ReturnType<typeof inputSchema>
|
type InputSchema = ReturnType<typeof inputSchema>
|
||||||
|
|
||||||
type Input = z.infer<InputSchema>
|
|
||||||
|
|
||||||
const searchResultSchema = lazySchema(() => {
|
const searchResultSchema = lazySchema(() => {
|
||||||
const searchHitSchema = z.object({
|
const searchHitSchema = z.object({
|
||||||
title: z.string().describe('The title of the search result'),
|
title: z.string().describe('The title of the search result'),
|
||||||
url: z.string().describe('The URL of the search result'),
|
url: z.string().describe('The URL of the search result'),
|
||||||
|
snippet: z.string().optional().describe('A short description of the search result'),
|
||||||
})
|
})
|
||||||
|
|
||||||
return z.object({
|
return z.object({
|
||||||
@@ -73,82 +62,6 @@ export type { WebSearchProgress } from '../../types/tools.js'
|
|||||||
|
|
||||||
import type { WebSearchProgress } from '../../types/tools.js'
|
import type { WebSearchProgress } from '../../types/tools.js'
|
||||||
|
|
||||||
function makeToolSchema(input: Input): BetaWebSearchTool20250305 {
|
|
||||||
return {
|
|
||||||
type: 'web_search_20250305',
|
|
||||||
name: 'web_search',
|
|
||||||
allowed_domains: input.allowed_domains,
|
|
||||||
blocked_domains: input.blocked_domains,
|
|
||||||
max_uses: 8, // Hardcoded to 8 searches maximum
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
function makeOutputFromSearchResponse(
|
|
||||||
result: BetaContentBlock[],
|
|
||||||
query: string,
|
|
||||||
durationSeconds: number,
|
|
||||||
): Output {
|
|
||||||
// The result is a sequence of these blocks:
|
|
||||||
// - text to start -- always?
|
|
||||||
// [
|
|
||||||
// - server_tool_use
|
|
||||||
// - web_search_tool_result
|
|
||||||
// - text and citation blocks intermingled
|
|
||||||
// ]+ (this block repeated for each search)
|
|
||||||
|
|
||||||
const results: (SearchResult | string)[] = []
|
|
||||||
let textAcc = ''
|
|
||||||
let inText = true
|
|
||||||
|
|
||||||
for (const block of result) {
|
|
||||||
if (block.type === 'server_tool_use') {
|
|
||||||
if (inText) {
|
|
||||||
inText = false
|
|
||||||
if (textAcc.trim().length > 0) {
|
|
||||||
results.push(textAcc.trim())
|
|
||||||
}
|
|
||||||
textAcc = ''
|
|
||||||
}
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
if (block.type === 'web_search_tool_result') {
|
|
||||||
// Handle error case - content is a WebSearchToolResultError
|
|
||||||
if (!Array.isArray(block.content)) {
|
|
||||||
const errorMessage = `Web search error: ${block.content.error_code}`
|
|
||||||
logError(new Error(errorMessage))
|
|
||||||
results.push(errorMessage)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
// Success case - add results to our collection
|
|
||||||
const hits = block.content.map(r => ({ title: r.title, url: r.url }))
|
|
||||||
results.push({
|
|
||||||
tool_use_id: block.tool_use_id,
|
|
||||||
content: hits,
|
|
||||||
})
|
|
||||||
}
|
|
||||||
|
|
||||||
if (block.type === 'text') {
|
|
||||||
if (inText) {
|
|
||||||
textAcc += block.text
|
|
||||||
} else {
|
|
||||||
inText = true
|
|
||||||
textAcc = block.text
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (textAcc.length) {
|
|
||||||
results.push(textAcc.trim())
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
query,
|
|
||||||
results,
|
|
||||||
durationSeconds,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
export const WebSearchTool = buildTool({
|
export const WebSearchTool = buildTool({
|
||||||
name: WEB_SEARCH_TOOL_NAME,
|
name: WEB_SEARCH_TOOL_NAME,
|
||||||
searchHint: 'search the web for current information',
|
searchHint: 'search the web for current information',
|
||||||
@@ -166,30 +79,9 @@ export const WebSearchTool = buildTool({
|
|||||||
return summary ? `Searching for ${summary}` : 'Searching the web'
|
return summary ? `Searching for ${summary}` : 'Searching the web'
|
||||||
},
|
},
|
||||||
isEnabled() {
|
isEnabled() {
|
||||||
const provider = getAPIProvider()
|
// Always enabled — the adapter factory selects the appropriate backend
|
||||||
const model = getMainLoopModel()
|
// (API server-side search or Bing fallback) based on provider capabilities.
|
||||||
|
return true
|
||||||
// Enable for firstParty
|
|
||||||
if (provider === 'firstParty') {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
// Enable for Vertex AI with supported models (Claude 4.0+)
|
|
||||||
if (provider === 'vertex') {
|
|
||||||
const supportsWebSearch =
|
|
||||||
model.includes('claude-opus-4') ||
|
|
||||||
model.includes('claude-sonnet-4') ||
|
|
||||||
model.includes('claude-haiku-4')
|
|
||||||
|
|
||||||
return supportsWebSearch
|
|
||||||
}
|
|
||||||
|
|
||||||
// Foundry only ships models that already support Web Search
|
|
||||||
if (provider === 'foundry') {
|
|
||||||
return true
|
|
||||||
}
|
|
||||||
|
|
||||||
return false
|
|
||||||
},
|
},
|
||||||
get inputSchema(): InputSchema {
|
get inputSchema(): InputSchema {
|
||||||
return inputSchema()
|
return inputSchema()
|
||||||
@@ -227,9 +119,6 @@ export const WebSearchTool = buildTool({
|
|||||||
renderToolUseProgressMessage,
|
renderToolUseProgressMessage,
|
||||||
renderToolResultMessage,
|
renderToolResultMessage,
|
||||||
extractSearchText() {
|
extractSearchText() {
|
||||||
// renderToolResultMessage shows only "Did N searches in Xs" chrome —
|
|
||||||
// the results[] content never appears on screen. Heuristic would index
|
|
||||||
// string entries in results[] (phantom match). Nothing to search.
|
|
||||||
return ''
|
return ''
|
||||||
},
|
},
|
||||||
async validateInput(input) {
|
async validateInput(input) {
|
||||||
@@ -254,149 +143,42 @@ export const WebSearchTool = buildTool({
|
|||||||
async call(input, context, _canUseTool, _parentMessage, onProgress) {
|
async call(input, context, _canUseTool, _parentMessage, onProgress) {
|
||||||
const startTime = performance.now()
|
const startTime = performance.now()
|
||||||
const { query } = input
|
const { query } = input
|
||||||
const userMessage = createUserMessage({
|
|
||||||
content: 'Perform a web search for the query: ' + query,
|
|
||||||
})
|
|
||||||
const toolSchema = makeToolSchema(input)
|
|
||||||
|
|
||||||
const useHaiku = getFeatureValue_CACHED_MAY_BE_STALE(
|
const adapter = createAdapter()
|
||||||
'tengu_plum_vx3',
|
const adapterResults = await adapter.search(query, {
|
||||||
false,
|
allowedDomains: input.allowed_domains,
|
||||||
)
|
blockedDomains: input.blocked_domains,
|
||||||
|
|
||||||
const appState = context.getAppState()
|
|
||||||
const queryStream = queryModelWithStreaming({
|
|
||||||
messages: [userMessage],
|
|
||||||
systemPrompt: asSystemPrompt([
|
|
||||||
'You are an assistant for performing a web search tool use',
|
|
||||||
]),
|
|
||||||
thinkingConfig: useHaiku
|
|
||||||
? { type: 'disabled' as const }
|
|
||||||
: context.options.thinkingConfig,
|
|
||||||
tools: [],
|
|
||||||
signal: context.abortController.signal,
|
signal: context.abortController.signal,
|
||||||
options: {
|
onProgress(progress) {
|
||||||
getToolPermissionContext: async () => appState.toolPermissionContext,
|
if (onProgress) {
|
||||||
model: useHaiku ? getSmallFastModel() : context.options.mainLoopModel,
|
const progressCounter = Date.now()
|
||||||
toolChoice: useHaiku ? { type: 'tool', name: 'web_search' } : undefined,
|
onProgress({
|
||||||
isNonInteractiveSession: context.options.isNonInteractiveSession,
|
toolUseID: `search-progress-${progressCounter}`,
|
||||||
hasAppendSystemPrompt: !!context.options.appendSystemPrompt,
|
data: progress,
|
||||||
extraToolSchemas: [toolSchema],
|
})
|
||||||
querySource: 'web_search_tool',
|
}
|
||||||
agents: context.options.agentDefinitions.activeAgents,
|
|
||||||
mcpTools: [],
|
|
||||||
agentId: context.agentId,
|
|
||||||
effortValue: appState.effortValue,
|
|
||||||
},
|
},
|
||||||
})
|
})
|
||||||
|
|
||||||
const allContentBlocks: BetaContentBlock[] = []
|
|
||||||
let currentToolUseId = null
|
|
||||||
let currentToolUseJson = ''
|
|
||||||
let progressCounter = 0
|
|
||||||
const toolUseQueries = new Map() // Map of tool_use_id to query
|
|
||||||
|
|
||||||
for await (const event of queryStream) {
|
|
||||||
if (event.type === 'assistant') {
|
|
||||||
const msg = event as { message: { content: BetaContentBlock[] } }
|
|
||||||
allContentBlocks.push(...msg.message.content)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
|
|
||||||
// Track tool use ID when server_tool_use starts
|
|
||||||
if (
|
|
||||||
event.type === 'stream_event'
|
|
||||||
) {
|
|
||||||
const streamEvt = event as { event?: { type: string; content_block?: { type: string; id?: string; tool_use_id?: string; content?: unknown; [key: string]: unknown }; delta?: { type: string; partial_json?: string; [key: string]: unknown }; [key: string]: unknown } }
|
|
||||||
if (streamEvt.event?.type === 'content_block_start') {
|
|
||||||
const contentBlock = streamEvt.event.content_block
|
|
||||||
if (contentBlock && contentBlock.type === 'server_tool_use') {
|
|
||||||
currentToolUseId = contentBlock.id as string
|
|
||||||
currentToolUseJson = ''
|
|
||||||
// Note: The ServerToolUseBlock doesn't contain input.query
|
|
||||||
// The actual query comes through input_json_delta events
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Accumulate JSON for current tool use
|
|
||||||
if (
|
|
||||||
currentToolUseId &&
|
|
||||||
streamEvt.event?.type === 'content_block_delta'
|
|
||||||
) {
|
|
||||||
const delta = streamEvt.event.delta
|
|
||||||
if (delta?.type === 'input_json_delta' && delta.partial_json) {
|
|
||||||
currentToolUseJson += delta.partial_json
|
|
||||||
|
|
||||||
// Try to extract query from partial JSON for progress updates
|
|
||||||
try {
|
|
||||||
// Look for a complete query field
|
|
||||||
const queryMatch = currentToolUseJson.match(
|
|
||||||
/"query"\s*:\s*"((?:[^"\\]|\\.)*)"/,
|
|
||||||
)
|
|
||||||
if (queryMatch && queryMatch[1]) {
|
|
||||||
// The regex properly handles escaped characters
|
|
||||||
const query = jsonParse('"' + queryMatch[1] + '"')
|
|
||||||
|
|
||||||
if (
|
|
||||||
!toolUseQueries.has(currentToolUseId) ||
|
|
||||||
toolUseQueries.get(currentToolUseId) !== query
|
|
||||||
) {
|
|
||||||
toolUseQueries.set(currentToolUseId, query)
|
|
||||||
progressCounter++
|
|
||||||
if (onProgress) {
|
|
||||||
onProgress({
|
|
||||||
toolUseID: `search-progress-${progressCounter}`,
|
|
||||||
data: {
|
|
||||||
type: 'query_update',
|
|
||||||
query,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch {
|
|
||||||
// Ignore parsing errors for partial JSON
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Yield progress when search results come in
|
|
||||||
if (
|
|
||||||
streamEvt.event?.type === 'content_block_start'
|
|
||||||
) {
|
|
||||||
const contentBlock = streamEvt.event.content_block
|
|
||||||
if (contentBlock && contentBlock.type === 'web_search_tool_result') {
|
|
||||||
// Get the actual query that was used for this search
|
|
||||||
const toolUseId = contentBlock.tool_use_id
|
|
||||||
const actualQuery = toolUseQueries.get(toolUseId) || query
|
|
||||||
const content = contentBlock.content
|
|
||||||
|
|
||||||
progressCounter++
|
|
||||||
if (onProgress) {
|
|
||||||
onProgress({
|
|
||||||
toolUseID: toolUseId || `search-progress-${progressCounter}`,
|
|
||||||
data: {
|
|
||||||
type: 'search_results_received',
|
|
||||||
resultCount: Array.isArray(content) ? content.length : 0,
|
|
||||||
query: actualQuery,
|
|
||||||
},
|
|
||||||
})
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} // end stream_event
|
|
||||||
}
|
|
||||||
|
|
||||||
// Process the final result
|
|
||||||
const endTime = performance.now()
|
const endTime = performance.now()
|
||||||
const durationSeconds = (endTime - startTime) / 1000
|
const durationSeconds = (endTime - startTime) / 1000
|
||||||
|
|
||||||
const data = makeOutputFromSearchResponse(
|
// Convert adapter SearchResult[] to legacy Output format
|
||||||
allContentBlocks,
|
const results: (SearchResult | string)[] = []
|
||||||
|
if (adapterResults.length > 0) {
|
||||||
|
results.push({
|
||||||
|
tool_use_id: 'adapter-search-1',
|
||||||
|
content: adapterResults.map(r => ({ title: r.title, url: r.url, snippet: r.snippet })),
|
||||||
|
})
|
||||||
|
} else {
|
||||||
|
results.push('No search results found.')
|
||||||
|
}
|
||||||
|
|
||||||
|
const data: Output = {
|
||||||
query,
|
query,
|
||||||
|
results,
|
||||||
durationSeconds,
|
durationSeconds,
|
||||||
)
|
}
|
||||||
return { data }
|
return { data }
|
||||||
},
|
},
|
||||||
mapToolResultToToolResultBlockParam(output, toolUseID) {
|
mapToolResultToToolResultBlockParam(output, toolUseID) {
|
||||||
@@ -404,20 +186,23 @@ export const WebSearchTool = buildTool({
|
|||||||
|
|
||||||
let formattedOutput = `Web search results for query: "${query}"\n\n`
|
let formattedOutput = `Web search results for query: "${query}"\n\n`
|
||||||
|
|
||||||
// Process the results array - it can contain both string summaries and search result objects.
|
|
||||||
// Guard against null/undefined entries that can appear after JSON round-tripping
|
|
||||||
// (e.g., from compaction or transcript deserialization).
|
|
||||||
;(results ?? []).forEach(result => {
|
;(results ?? []).forEach(result => {
|
||||||
if (result == null) {
|
if (result == null) {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
if (typeof result === 'string') {
|
if (typeof result === 'string') {
|
||||||
// Text summary
|
|
||||||
formattedOutput += result + '\n\n'
|
formattedOutput += result + '\n\n'
|
||||||
} else {
|
} else {
|
||||||
// Search result with links
|
|
||||||
if (result.content?.length > 0) {
|
if (result.content?.length > 0) {
|
||||||
formattedOutput += `Links: ${jsonStringify(result.content)}\n\n`
|
formattedOutput += 'Links:\n'
|
||||||
|
for (const link of result.content) {
|
||||||
|
formattedOutput += ` - [${link.title}](${link.url})`
|
||||||
|
if (link.snippet) {
|
||||||
|
formattedOutput += `: ${link.snippet}`
|
||||||
|
}
|
||||||
|
formattedOutput += '\n'
|
||||||
|
}
|
||||||
|
formattedOutput += '\n'
|
||||||
} else {
|
} else {
|
||||||
formattedOutput += 'No links found.\n\n'
|
formattedOutput += 'No links found.\n\n'
|
||||||
}
|
}
|
||||||
|
|||||||
82
src/tools/WebSearchTool/__tests__/bingAdapter.integration.ts
Normal file
82
src/tools/WebSearchTool/__tests__/bingAdapter.integration.ts
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
/**
|
||||||
|
* Integration test for BingSearchAdapter — hits the real Bing search.
|
||||||
|
*
|
||||||
|
* Usage:
|
||||||
|
* bun run src/tools/WebSearchTool/__tests__/bingAdapter.integration.ts
|
||||||
|
*
|
||||||
|
* Optional env vars:
|
||||||
|
* BING_QUERY — search query (default: "Claude AI Anthropic")
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Provide MACRO globals needed by the codebase when running outside dev mode
|
||||||
|
if (!globalThis.MACRO) {
|
||||||
|
globalThis.MACRO = { VERSION: '0.0.0-test', BUILD_TIME: '0' } as any
|
||||||
|
}
|
||||||
|
|
||||||
|
import { BingSearchAdapter, extractBingResults } from '../adapters/bingAdapter'
|
||||||
|
|
||||||
|
const query = process.env.BING_QUERY || 'Claude AI Anthropic'
|
||||||
|
|
||||||
|
/**
 * Runs one live Bing query through BingSearchAdapter, prints every hit, and
 * checks that each hit has a non-empty string title and an http(s)-looking URL.
 *
 * Exits the process with status 1 when no results come back or when any
 * result fails the structural check.
 */
async function main() {
  console.log(`\n🔍 Searching Bing for: "${query}"\n`)

  const bing = new BingSearchAdapter()
  const begunAt = Date.now()

  const hits = await bing.search(query, {
    // Mirror the adapter's progress events on stdout.
    onProgress: (progress) => {
      if (progress.type === 'query_update') {
        console.log(` → Query sent: ${progress.query}`)
      }
      if (progress.type === 'search_results_received') {
        console.log(` → Received ${progress.resultCount} results`)
      }
    },
  })

  const tookMs = Date.now() - begunAt
  console.log(`\n✅ Done in ${tookMs}ms — ${hits.length} result(s)\n`)

  if (hits.length === 0) {
    const hints = [
      '⚠️ No results returned. Possible causes:',
      ' - Bing returned a CAPTCHA or rate-limited the request',
      ' - Network/firewall issue',
      ' - Bing HTML structure changed',
      ' - Anti-bot detection triggered\n',
    ]
    for (const hint of hints) {
      console.log(hint)
    }
    process.exit(1)
  }

  // Human-readable dump: rank + title, URL, and at most 150 chars of snippet.
  for (let index = 0; index < hits.length; index++) {
    const hit = hits[index]
    console.log(` ${index + 1}. ${hit.title}`)
    console.log(` ${hit.url}`)
    if (hit.snippet) {
      const flattened = hit.snippet.replace(/\n/g, ' ')
      const ellipsis = flattened.length > 150 ? '…' : ''
      console.log(` ${flattened.slice(0, 150)}${ellipsis}`)
    }
    console.log()
  }

  // Validate result structure
  let allValid = true
  for (let index = 0; index < hits.length; index++) {
    const hit = hits[index]
    if (!hit.title || typeof hit.title !== 'string') {
      console.error(`❌ Result ${index + 1}: missing or non-string title`, hit)
      allValid = false
    }
    if (!hit.url || !hit.url.startsWith('http')) {
      console.error(`❌ Result ${index + 1}: missing or non-http url`, hit)
      allValid = false
    }
  }

  if (!allValid) {
    process.exit(1)
  } else {
    console.log('✅ All results have valid structure.\n')
  }
}

// Any rejection escaping main() is reported and turned into exit status 1.
main().catch((err) => {
  console.error('❌ Fatal error:', err)
  process.exit(1)
})
|
||||||
499
src/tools/WebSearchTool/__tests__/bingAdapter.test.ts
Normal file
499
src/tools/WebSearchTool/__tests__/bingAdapter.test.ts
Normal file
@@ -0,0 +1,499 @@
|
|||||||
|
import { describe, expect, mock, test } from 'bun:test'
|
||||||
|
import { extractBingResults, decodeHtmlEntities } from '../adapters/bingAdapter'
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// decodeHtmlEntities
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('decodeHtmlEntities', () => {
  // Every input below must contain the *encoded* entity text. If an entity is
  // written in its decoded form the case degenerates into comparing a string
  // with itself (or, for the apostrophe case, into a syntax error).

  test('decodes common named entities', () => {
    expect(decodeHtmlEntities('&amp; &lt; &gt;')).toBe('& < >')
  })

  test('decodes quote entities', () => {
    expect(decodeHtmlEntities('&quot;hello&quot;')).toBe('"hello"')
  })

  test('decodes numeric and hex apostrophe entities', () => {
    // &#39; is the decimal form and &#x27; the hexadecimal form of "'".
    expect(decodeHtmlEntities('&#39;it&#x27;s')).toBe("'it's")
  })

  test('decodes &nbsp; to non-breaking space (\\u00A0)', () => {
    expect(decodeHtmlEntities('a&nbsp;b')).toBe('a\u00A0b')
  })

  test('returns plain text unchanged', () => {
    expect(decodeHtmlEntities('hello world')).toBe('hello world')
  })

  test('handles empty string', () => {
    expect(decodeHtmlEntities('')).toBe('')
  })

  test('decodes multiple occurrences of the same entity', () => {
    expect(decodeHtmlEntities('a&amp;b&amp;c')).toBe('a&b&c')
  })

  test('handles mixed entities in one string', () => {
    expect(decodeHtmlEntities('&lt;a&nbsp;href=&quot;x&quot;&gt;')).toBe('<a\u00A0href="x">')
  })
})
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// extractBingResults
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('extractBingResults', () => {
|
||||||
|
test('extracts results from standard Bing HTML', () => {
|
||||||
|
const html = `
|
||||||
|
<ol id="b_results">
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://example.com/page1" h="ID=SERP,1">Example Title 1</a></h2>
|
||||||
|
<div class="b_caption">
|
||||||
|
<p class="b_lineclamp">First result description</p>
|
||||||
|
</div>
|
||||||
|
</li>
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://example.com/page2" h="ID=SERP,2">Example Title 2</a></h2>
|
||||||
|
<div class="b_caption">
|
||||||
|
<p class="b_lineclamp">Second result description</p>
|
||||||
|
</div>
|
||||||
|
</li>
|
||||||
|
</ol>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results).toHaveLength(2)
|
||||||
|
expect(results[0]).toEqual({
|
||||||
|
title: 'Example Title 1',
|
||||||
|
url: 'https://example.com/page1',
|
||||||
|
snippet: 'First result description',
|
||||||
|
})
|
||||||
|
expect(results[1]).toEqual({
|
||||||
|
title: 'Example Title 2',
|
||||||
|
url: 'https://example.com/page2',
|
||||||
|
snippet: 'Second result description',
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns empty array when no b_algo blocks exist', () => {
|
||||||
|
const html = `
|
||||||
|
<ol id="b_results">
|
||||||
|
<li class="b_ad">Ad result</li>
|
||||||
|
<li class="b_ans">Answer card</li>
|
||||||
|
</ol>
|
||||||
|
`
|
||||||
|
expect(extractBingResults(html)).toEqual([])
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns empty array for empty HTML', () => {
|
||||||
|
expect(extractBingResults('')).toEqual([])
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns empty array for unrelated HTML', () => {
|
||||||
|
expect(extractBingResults('<html><body>Hello</body></html>')).toEqual([])
|
||||||
|
})
|
||||||
|
|
||||||
|
test('skips Bing-internal links', () => {
|
||||||
|
const html = `
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="/search?q=more">More results</a></h2>
|
||||||
|
</li>
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://www.bing.com/videos">Bing Videos</a></h2>
|
||||||
|
</li>
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="#anchor">Jump link</a></h2>
|
||||||
|
</li>
|
||||||
|
`
|
||||||
|
expect(extractBingResults(html)).toEqual([])
|
||||||
|
})
|
||||||
|
|
||||||
|
test('strips HTML tags from titles', () => {
|
||||||
|
const html = `
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://example.com">Result with <strong>bold</strong> and <em>italic</em></a></h2>
|
||||||
|
</li>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results).toHaveLength(1)
|
||||||
|
expect(results[0].title).toBe('Result with bold and italic')
|
||||||
|
})
|
||||||
|
|
||||||
|
test('decodes HTML entities in titles', () => {
  // The anchor text must carry encoded entities: a literal "<cartoon>" would be
  // treated as a tag and stripped instead of exercising entity decoding.
  const html = `
    <li class="b_algo">
      <h2><a href="https://example.com">Tom &amp; Jerry &lt;cartoon&gt;</a></h2>
    </li>
  `
  const results = extractBingResults(html)
  expect(results[0].title).toBe('Tom & Jerry <cartoon>')
})
|
||||||
|
|
||||||
|
test('extracts snippet from b_lineclamp class', () => {
|
||||||
|
const html = `
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://example.com">Title</a></h2>
|
||||||
|
<p class="b_lineclamp3 b_algo_slug">Lineclamp snippet text here</p>
|
||||||
|
</li>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results[0].snippet).toBe('Lineclamp snippet text here')
|
||||||
|
})
|
||||||
|
|
||||||
|
test('extracts snippet from b_caption paragraph fallback', () => {
|
||||||
|
const html = `
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://example.com">Title</a></h2>
|
||||||
|
<div class="b_caption">
|
||||||
|
<p>Caption paragraph text</p>
|
||||||
|
</div>
|
||||||
|
</li>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results[0].snippet).toBe('Caption paragraph text')
|
||||||
|
})
|
||||||
|
|
||||||
|
test('extracts snippet from b_caption div fallback', () => {
|
||||||
|
const html = `
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://example.com">Title</a></h2>
|
||||||
|
<div class="b_caption">Direct caption text without p tag</div>
|
||||||
|
</li>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results[0].snippet).toBe('Direct caption text without p tag')
|
||||||
|
})
|
||||||
|
|
||||||
|
test('returns undefined snippet when no caption exists', () => {
|
||||||
|
const html = `
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://example.com">Title Only</a></h2>
|
||||||
|
</li>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results[0].snippet).toBeUndefined()
|
||||||
|
})
|
||||||
|
|
||||||
|
test('handles mixed result types and only extracts b_algo', () => {
|
||||||
|
const html = `
|
||||||
|
<ol id="b_results">
|
||||||
|
<li class="b_ad"><h2><a href="https://ad.com">Ad Title</a></h2></li>
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://real-result.com">Real Result</a></h2>
|
||||||
|
<p class="b_lineclamp">A real snippet</p>
|
||||||
|
</li>
|
||||||
|
<li class="b_ans"><div>People also ask</div></li>
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://another.com">Another Result</a></h2>
|
||||||
|
</li>
|
||||||
|
</ol>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results).toHaveLength(2)
|
||||||
|
expect(results[0].title).toBe('Real Result')
|
||||||
|
expect(results[1].title).toBe('Another Result')
|
||||||
|
})
|
||||||
|
|
||||||
|
test('skips b_algo blocks without h2 > a structure', () => {
|
||||||
|
const html = `
|
||||||
|
<li class="b_algo">
|
||||||
|
<div>No link here</div>
|
||||||
|
</li>
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2><a href="https://example.com">Valid Result</a></h2>
|
||||||
|
</li>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results).toHaveLength(1)
|
||||||
|
expect(results[0].title).toBe('Valid Result')
|
||||||
|
})
|
||||||
|
|
||||||
|
test('handles extra whitespace in h2 > a structure', () => {
|
||||||
|
const html = `
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2>
|
||||||
|
<a href="https://example.com" h="ID=SERP,1" >
|
||||||
|
Whitespace Title
|
||||||
|
</a>
|
||||||
|
</h2>
|
||||||
|
</li>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results).toHaveLength(1)
|
||||||
|
expect(results[0].title).toBe('Whitespace Title')
|
||||||
|
})
|
||||||
|
|
||||||
|
test('handles snippet with HTML entities', () => {
  // The snippet is written with encoded entities so the decoded comparison
  // below actually verifies entity decoding of snippet text.
  const html = `
    <li class="b_algo">
      <h2><a href="https://example.com">Title</a></h2>
      <p class="b_lineclamp">5 &lt; 10 &amp; 10 &gt; 5</p>
    </li>
  `
  const results = extractBingResults(html)
  expect(results[0].snippet).toBe('5 < 10 & 10 > 5')
})
|
||||||
|
|
||||||
|
test('handles real-world Bing HTML structure', () => {
|
||||||
|
const html = `
|
||||||
|
<ol id="b_results" role="main">
|
||||||
|
<li class="b_algo" data-id="">
|
||||||
|
<div class="b_title">
|
||||||
|
<h2>
|
||||||
|
<a href="https://docs.python.org/3/tutorial/index.html" target="_blank" h="ID=SERP,5125.1">
|
||||||
|
Python Tutorial
|
||||||
|
</a>
|
||||||
|
</h2>
|
||||||
|
</div>
|
||||||
|
<div class="b_caption">
|
||||||
|
<div class="b_attribution" u="0|5125|4976674477245">
|
||||||
|
<cite>https://docs.python.org</cite>
|
||||||
|
</div>
|
||||||
|
<p class="b_lineclamp3">
|
||||||
|
Welcome to the Python Tutorial. This tutorial introduces you to the basic concepts and features...
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</li>
|
||||||
|
<li class="b_algo">
|
||||||
|
<h2>
|
||||||
|
<a href="https://realpython.com/python-guide/" h="ID=SERP,5125.2">
|
||||||
|
Real Python Guide
|
||||||
|
</a>
|
||||||
|
</h2>
|
||||||
|
<div class="b_caption">
|
||||||
|
<div class="b_attribution">
|
||||||
|
<cite>https://realpython.com</cite>
|
||||||
|
</div>
|
||||||
|
<p>
|
||||||
|
The ultimate Python guide for beginners and experts alike.
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
</li>
|
||||||
|
</ol>
|
||||||
|
`
|
||||||
|
const results = extractBingResults(html)
|
||||||
|
expect(results).toHaveLength(2)
|
||||||
|
expect(results[0].title).toBe('Python Tutorial')
|
||||||
|
expect(results[0].url).toBe('https://docs.python.org/3/tutorial/index.html')
|
||||||
|
expect(results[0].snippet).toContain('Welcome to the Python Tutorial')
|
||||||
|
expect(results[1].title).toBe('Real Python Guide')
|
||||||
|
expect(results[1].snippet).toContain('ultimate Python guide')
|
||||||
|
})
|
||||||
|
})
|
||||||
|
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
// BingSearchAdapter.search (integration with mocked axios)
|
||||||
|
// ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
describe('BingSearchAdapter.search', () => {
  // Dynamic import so mock.module() takes effect
  const createAdapter = async () => {
    const { BingSearchAdapter } = await import('../adapters/bingAdapter')
    return new BingSearchAdapter()
  }

  /**
   * Registers the module mocks every case needs: an `axios` default export
   * exposing the given `get` / `isCancel`, and the http util returning a fixed
   * user agent.
   */
  const installMocks = (
    get: unknown,
    isCancel: (e: unknown) => boolean = () => false,
  ): void => {
    mock.module('axios', () => ({
      default: { get, isCancel },
    }))
    mock.module('../../../utils/http', () => ({
      getWebFetchUserAgent: () => 'TestAgent/1.0',
    }))
  }

  /** axios.get stand-in that always resolves with the given HTML body. */
  const respondWith = (html: string) =>
    mock((_url: string) => Promise.resolve({ data: html }))

  /** Builds a results page holding one snippet-less b_algo entry per [url, title] pair. */
  const resultsPage = (...entries: Array<[string, string]>): string => {
    const items = entries.map(
      ([url, title]) =>
        `<li class="b_algo">\n<h2><a href="${url}">${title}</a></h2>\n</li>`,
    )
    return `\n<ol id="b_results">\n${items.join('\n')}\n</ol>\n`
  }

  const SAMPLE_HTML = `
    <ol id="b_results">
      <li class="b_algo">
        <h2><a href="https://example.com/result1">Result One</a></h2>
        <p class="b_lineclamp">Snippet one</p>
      </li>
      <li class="b_algo">
        <h2><a href="https://example.com/result2">Result Two</a></h2>
        <p class="b_lineclamp">Snippet two</p>
      </li>
    </ol>
  `

  test('returns parsed results from fetched HTML', async () => {
    installMocks(respondWith(SAMPLE_HTML))

    const adapter = await createAdapter()
    const results = await adapter.search('test query', {})

    expect(results).toHaveLength(2)
    expect(results[0].title).toBe('Result One')
    expect(results[1].title).toBe('Result Two')
  })

  test('calls onProgress with query_update and search_results_received', async () => {
    installMocks(respondWith(SAMPLE_HTML))

    const progressCalls: any[] = []
    const onProgress = (p: any) => progressCalls.push(p)

    const adapter = await createAdapter()
    await adapter.search('test', { onProgress })

    expect(progressCalls).toHaveLength(2)
    expect(progressCalls[0].type).toBe('query_update')
    expect(progressCalls[0].query).toBe('test')
    expect(progressCalls[1].type).toBe('search_results_received')
    expect(progressCalls[1].resultCount).toBe(2)
  })

  test('filters results by allowedDomains', async () => {
    installMocks(
      respondWith(
        resultsPage(
          ['https://allowed.com/a', 'Allowed Result'],
          ['https://blocked.com/b', 'Blocked Result'],
        ),
      ),
    )

    const adapter = await createAdapter()
    const results = await adapter.search('test', {
      allowedDomains: ['allowed.com'],
    })

    expect(results).toHaveLength(1)
    expect(results[0].url).toBe('https://allowed.com/a')
  })

  test('filters results by blockedDomains', async () => {
    installMocks(
      respondWith(
        resultsPage(
          ['https://good.com/a', 'Good Result'],
          ['https://spam.com/b', 'Spam Result'],
        ),
      ),
    )

    const adapter = await createAdapter()
    const results = await adapter.search('test', {
      blockedDomains: ['spam.com'],
    })

    expect(results).toHaveLength(1)
    expect(results[0].url).toBe('https://good.com/a')
  })

  test('filters subdomains with allowedDomains', async () => {
    installMocks(
      respondWith(
        resultsPage(
          ['https://docs.example.com/page', 'Subdomain Result'],
          ['https://other.com/page', 'Other Result'],
        ),
      ),
    )

    const adapter = await createAdapter()
    const results = await adapter.search('test', {
      allowedDomains: ['example.com'],
    })

    expect(results).toHaveLength(1)
    expect(results[0].url).toBe('https://docs.example.com/page')
  })

  test('throws AbortError when signal is already aborted', async () => {
    installMocks(
      // Behaves like axios: rejects with a cancel-flagged error once the
      // request's signal is aborted.
      mock((_url: string, config?: { signal?: AbortSignal }) => {
        if (config?.signal?.aborted) {
          return Promise.reject(
            Object.assign(new Error('canceled'), { __CANCEL__: true }),
          )
        }
        return Promise.resolve({ data: SAMPLE_HTML })
      }),
      (e) =>
        (e as { __CANCEL__?: boolean } | null | undefined)?.__CANCEL__ === true,
    )

    const adapter = await createAdapter()
    const controller = new AbortController()
    controller.abort()

    const { AbortError } = await import('../../../utils/errors')
    await expect(
      adapter.search('test', { signal: controller.signal }),
    ).rejects.toThrow(AbortError)
  })

  test('re-throws non-abort axios errors', async () => {
    const networkError = new Error('Network error')
    installMocks(mock(() => Promise.reject(networkError)))

    const adapter = await createAdapter()
    await expect(adapter.search('test', {})).rejects.toThrow('Network error')
  })

  test('encodes query parameter in URL', async () => {
    const axiosGet = respondWith(SAMPLE_HTML)
    installMocks(axiosGet)

    const adapter = await createAdapter()
    await adapter.search('hello world & special=chars', {})

    // First positional argument of the first axios.get call is the request URL.
    const calledUrl = axiosGet.mock.calls[0][0]
    expect(calledUrl).toContain('q=hello%20world%20%26%20special%3Dchars')
  })
})
|
||||||
173
src/tools/WebSearchTool/adapters/apiAdapter.ts
Normal file
173
src/tools/WebSearchTool/adapters/apiAdapter.ts
Normal file
@@ -0,0 +1,173 @@
|
|||||||
|
/**
|
||||||
|
* API-based search adapter — delegates to Anthropic's server-side
|
||||||
|
* web_search_20250305 tool via a secondary API call.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import type {
|
||||||
|
BetaContentBlock,
|
||||||
|
BetaWebSearchTool20250305,
|
||||||
|
} from '@anthropic-ai/sdk/resources/beta/messages/messages.mjs'
|
||||||
|
import { getFeatureValue_CACHED_MAY_BE_STALE } from '../../../services/analytics/growthbook.js'
|
||||||
|
import { queryModelWithStreaming } from '../../../services/api/claude.js'
|
||||||
|
import { createUserMessage } from '../../../utils/messages.js'
|
||||||
|
import { getMainLoopModel, getSmallFastModel } from '../../../utils/model/model.js'
|
||||||
|
import { jsonParse } from '../../../utils/slowOperations.js'
|
||||||
|
import { asSystemPrompt } from '../../../utils/systemPromptType.js'
|
||||||
|
import type { SearchResult, SearchOptions, WebSearchAdapter } from './types.js'
|
||||||
|
|
||||||
|
/**
 * Builds the `web_search_20250305` server-tool definition sent with the
 * secondary API request.
 *
 * @param input - Optional domain allow/block lists, copied through unchanged
 *   (an omitted list yields an `undefined` field).
 * @returns The tool schema, named `web_search` and limited to 8 uses.
 */
function makeToolSchema(input: { allowedDomains?: string[]; blockedDomains?: string[] }): BetaWebSearchTool20250305 {
  const { allowedDomains, blockedDomains } = input
  const schema: BetaWebSearchTool20250305 = {
    type: 'web_search_20250305',
    name: 'web_search',
    allowed_domains: allowedDomains,
    blocked_domains: blockedDomains,
    max_uses: 8,
  }
  return schema
}
|
||||||
|
|
||||||
|
/**
 * Search adapter that runs the query through the server-side `web_search`
 * tool by issuing a secondary streaming model request.
 */
export class ApiSearchAdapter implements WebSearchAdapter {
  /**
   * Asks the model to perform a web search and collects the resulting links.
   *
   * While the response streams, `options.onProgress` receives:
   *  - `query_update` each time the query text of the current server tool use
   *    can be parsed from the partial JSON input and differs from the last
   *    value reported for that tool use;
   *  - `search_results_received` when a `web_search_tool_result` block starts.
   *
   * When the `tengu_plum_vx3` feature value is true the request uses the small
   * fast model with thinking disabled and a forced `web_search` tool choice;
   * otherwise it uses the main loop model with a 10000-token thinking budget.
   *
   * @param query - The search query supplied by the caller.
   * @param options - Abort signal, progress callback and domain filters.
   * @returns The results extracted from all assistant content blocks.
   */
  async search(
    query: string,
    options: SearchOptions,
  ): Promise<SearchResult[]> {
    const { signal, onProgress, allowedDomains, blockedDomains } = options

    const userMessage = createUserMessage({
      content: 'Perform a web search for the query: ' + query,
    })
    const toolSchema = makeToolSchema({ allowedDomains, blockedDomains })

    const useHaiku = getFeatureValue_CACHED_MAY_BE_STALE('tengu_plum_vx3', false)

    const queryStream = queryModelWithStreaming({
      messages: [userMessage],
      systemPrompt: asSystemPrompt([
        'You are an assistant for performing a web search tool use',
      ]),
      thinkingConfig: useHaiku
        ? { type: 'disabled' as const }
        : { type: 'enabled' as const, budgetTokens: 10000 },
      tools: [],
      signal: signal ?? new AbortController().signal,
      options: {
        getToolPermissionContext: async () => ({
          mode: 'default' as const,
          additionalWorkingDirectories: new Map(),
          alwaysAllowRules: {},
          alwaysDenyRules: {},
          alwaysAskRules: {},
          isBypassPermissionsModeAvailable: false,
        }),
        model: useHaiku ? getSmallFastModel() : getMainLoopModel(),
        toolChoice: useHaiku ? { type: 'tool' as const, name: 'web_search' } : undefined,
        isNonInteractiveSession: false,
        hasAppendSystemPrompt: false,
        extraToolSchemas: [toolSchema],
        querySource: 'web_search_tool' as const,
        agents: [],
        mcpTools: [],
        agentId: undefined,
        effortValue: undefined,
      },
    })

    // Matches a complete `"query": "..."` string value (escapes allowed) inside
    // the partial tool-input JSON accumulated so far.
    const partialQueryPattern = /"query"\s*:\s*"((?:[^"\\]|\\.)*)"/

    const allContentBlocks: BetaContentBlock[] = []
    let currentToolUseId: string | null = null
    let currentToolUseJson = ''
    // Last query text reported per server tool use id.
    const toolUseQueries = new Map<string, string>()

    for await (const event of queryStream) {
      if (event.type === 'assistant') {
        const msg = event as { message: { content: BetaContentBlock[] } }
        allContentBlocks.push(...msg.message.content)
        continue
      }

      if (event.type !== 'stream_event') {
        continue
      }

      const streamEvt = event as {
        event?: {
          type: string
          content_block?: { type: string; id?: string; tool_use_id?: string; content?: unknown; [key: string]: unknown }
          delta?: { type: string; partial_json?: string; [key: string]: unknown }
          [key: string]: unknown
        }
      }
      const inner = streamEvt.event

      if (inner?.type === 'content_block_start') {
        const contentBlock = inner.content_block
        if (contentBlock?.type === 'server_tool_use') {
          // A new server tool call begins: start accumulating its JSON input.
          currentToolUseId = contentBlock.id as string
          currentToolUseJson = ''
        } else if (contentBlock?.type === 'web_search_tool_result') {
          const toolUseId = contentBlock.tool_use_id as string
          // Fall back to the caller's query if no query was parsed for this tool use.
          const actualQuery = toolUseQueries.get(toolUseId) || query
          const content = contentBlock.content
          onProgress?.({
            type: 'search_results_received',
            resultCount: Array.isArray(content) ? content.length : 0,
            query: actualQuery,
          })
        }
        continue
      }

      if (currentToolUseId && inner?.type === 'content_block_delta') {
        const delta = inner.delta
        if (delta?.type === 'input_json_delta' && delta.partial_json) {
          currentToolUseJson += delta.partial_json
          try {
            const queryMatch = currentToolUseJson.match(partialQueryPattern)
            if (queryMatch && queryMatch[1]) {
              // Re-wrap the captured body in quotes so JSON escapes are decoded.
              const parsedQuery = jsonParse('"' + queryMatch[1] + '"')
              if (toolUseQueries.get(currentToolUseId) !== parsedQuery) {
                toolUseQueries.set(currentToolUseId, parsedQuery)
                onProgress?.({
                  type: 'query_update',
                  query: parsedQuery,
                })
              }
            }
          } catch {
            // Ignore parsing errors for partial JSON
          }
        }
      }
    }

    // Extract SearchResult[] from content blocks
    return extractSearchResults(allContentBlocks)
  }
}
|
||||||
|
|
||||||
|
/**
 * Flattens every `web_search_tool_result` block whose content is an array
 * into `{ title, url }` entries, preserving block and entry order. Blocks of
 * any other type, or whose content is not an array, contribute nothing.
 */
function extractSearchResults(
  blocks: BetaContentBlock[],
): SearchResult[] {
  return blocks.flatMap((block): SearchResult[] => {
    if (block.type !== 'web_search_tool_result' || !Array.isArray(block.content)) {
      return []
    }
    const hits = block.content as Array<{ title: string; url: string; page_age?: string; type?: string }>
    // Only title and url are carried over; other fields are dropped.
    return hits.map(({ title, url }) => ({ title, url }))
  })
}
|
||||||
204
src/tools/WebSearchTool/adapters/bingAdapter.ts
Normal file
204
src/tools/WebSearchTool/adapters/bingAdapter.ts
Normal file
@@ -0,0 +1,204 @@
|
|||||||
|
/**
|
||||||
|
* Bing-based search adapter — fetches Bing search pages and extracts
|
||||||
|
* search results using regex pattern matching on raw HTML.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import axios from 'axios'
|
||||||
|
import he from 'he'
|
||||||
|
import { AbortError } from '../../../utils/errors.js'
|
||||||
|
import type { SearchResult, SearchOptions, WebSearchAdapter } from './types.js'
|
||||||
|
|
||||||
|
const FETCH_TIMEOUT_MS = 30_000
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Browser-like headers to avoid Bing's anti-bot JS-rendered response.
|
||||||
|
* These mimic Microsoft Edge on macOS to get full HTML search results.
|
||||||
|
*/
|
||||||
|
const BROWSER_HEADERS = {
|
||||||
|
'User-Agent':
|
||||||
|
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
|
||||||
|
Accept:
|
||||||
|
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
|
||||||
|
'Accept-Language': 'en-US,en;q=0.9',
|
||||||
|
'Accept-Encoding': 'gzip, deflate, br',
|
||||||
|
'Cache-Control': 'no-cache',
|
||||||
|
Pragma: 'no-cache',
|
||||||
|
'Sec-Ch-Ua': '"Microsoft Edge";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
|
||||||
|
'Sec-Ch-Ua-Mobile': '?0',
|
||||||
|
'Sec-Ch-Ua-Platform': '"macOS"',
|
||||||
|
'Sec-Fetch-Dest': 'document',
|
||||||
|
'Sec-Fetch-Mode': 'navigate',
|
||||||
|
'Sec-Fetch-Site': 'none',
|
||||||
|
'Sec-Fetch-User': '?1',
|
||||||
|
'Upgrade-Insecure-Requests': '1',
|
||||||
|
} as const
|
||||||
|
|
||||||
|
/**
 * Search adapter that fetches the Bing results page over HTTP and parses the
 * organic results out of the returned HTML.
 */
export class BingSearchAdapter implements WebSearchAdapter {
  /**
   * Fetches the Bing results page for `query` and returns the parsed results
   * after client-side domain filtering.
   *
   * Progress: emits `query_update` before the request and
   * `search_results_received` (with the post-filter count) afterwards.
   *
   * @param query - Search text; URL-encoded into the `q` parameter.
   * @param options - Optional abort signal, progress callback and domain
   *   allow/block lists. Domain entries match the host itself and any
   *   subdomain, case-insensitively.
   * @returns Results whose URL parses and passes the domain filters.
   * @throws AbortError when the signal is aborted before or during the request.
   *   Any other request failure is re-thrown unchanged.
   */
  async search(
    query: string,
    options: SearchOptions,
  ): Promise<SearchResult[]> {
    const { signal, onProgress, allowedDomains, blockedDomains } = options

    if (signal?.aborted) {
      throw new AbortError()
    }

    onProgress?.({ type: 'query_update', query })

    const url = `https://www.bing.com/search?q=${encodeURIComponent(query)}&setmkt=en-US`

    // Forward the caller's abort to a request-local controller. The listener is
    // removed in `finally` so a long-lived caller signal does not accumulate
    // one dead listener per search.
    const abortController = new AbortController()
    const forwardAbort = () => abortController.abort()
    signal?.addEventListener('abort', forwardAbort, { once: true })

    let html: string
    try {
      const response = await axios.get(url, {
        signal: abortController.signal,
        timeout: FETCH_TIMEOUT_MS,
        responseType: 'text',
        headers: BROWSER_HEADERS,
      })
      html = response.data
    } catch (e) {
      if (axios.isCancel(e) || abortController.signal.aborted) {
        throw new AbortError()
      }
      throw e
    } finally {
      signal?.removeEventListener('abort', forwardAbort)
    }

    if (abortController.signal.aborted) {
      throw new AbortError()
    }

    const rawResults = extractBingResults(html)

    // Client-side domain filtering
    const results = rawResults.filter((r) => {
      if (!r.url) return false
      try {
        const hostname = new URL(r.url).hostname
        if (allowedDomains?.length && !BingSearchAdapter.hostMatches(hostname, allowedDomains)) {
          return false
        }
        if (blockedDomains?.length && BingSearchAdapter.hostMatches(hostname, blockedDomains)) {
          return false
        }
      } catch {
        // Unparseable URLs are dropped rather than failing the whole search.
        return false
      }
      return true
    })

    onProgress?.({
      type: 'search_results_received',
      resultCount: results.length,
      query,
    })

    return results
  }

  /**
   * True when `hostname` equals one of `domains` or is a subdomain of one.
   * Entries are lower-cased before comparison because `URL.hostname` is
   * already lower-case for http(s) URLs.
   */
  private static hostMatches(hostname: string, domains: readonly string[]): boolean {
    return domains.some((entry) => {
      const domain = entry.toLowerCase()
      return hostname === domain || hostname.endsWith('.' + domain)
    })
  }
}
|
||||||
|
|
||||||
|
/**
 * Pull the organic results out of a Bing results page.
 *
 * Each result is an `<li class="b_algo">` item; its title and link come from
 * the first `<h2><a href="…">…</a>` inside the item. Items with no such link,
 * or whose link does not resolve to an external URL, are skipped.
 *
 * @param html - Raw HTML of the Bing results page.
 * @returns One entry per usable result, in document order.
 */
export function extractBingResults(html: string): SearchResult[] {
  const itemPattern = /<li\s+class="b_algo"[^>]*>([\s\S]*?)<\/li>/gi
  const headingLinkPattern = /<h2[^>]*>\s*<a[^>]+href="([^"]+)"[^>]*>([\s\S]*?)<\/a>/i

  const collected: SearchResult[] = []

  for (const item of html.matchAll(itemPattern)) {
    const inner = item[1]

    const heading = inner.match(headingLinkPattern)
    if (heading === null) continue

    const [, href, anchorHtml] = heading

    // The href is entity-encoded in the page; decode it, then unwrap Bing
    // redirect links (bing.com/ck/a?...&u=a1aHR0cHM6Ly9...) and drop
    // Bing-internal / relative links.
    const url = resolveBingUrl(decodeHtmlEntities(href))
    if (!url) continue

    // Title is the anchor's text content with inline markup removed.
    const title = decodeHtmlEntities(anchorHtml.replace(/<[^>]+>/g, '').trim())

    // Snippet lookup order: b_lineclamp → b_caption <p> → b_caption text.
    collected.push({ title, url, snippet: extractSnippet(inner) })
  }

  return collected
}
|
||||||
|
|
||||||
|
/**
 * Extract the descriptive snippet from one Bing result item.
 *
 * Candidate locations are tried in order and the first one that yields
 * non-empty text wins:
 *   1. `<p class="b_lineclamp…">`
 *   2. the first `<p>` inside `<div class="b_caption…">`
 *   3. all text inside `<div class="b_caption…">`
 *
 * An empty candidate falls through to the next one instead of being returned
 * as an empty string.
 *
 * @param block - Inner HTML of a single result item.
 * @returns The plain-text snippet, or `undefined` when none is found.
 */
function extractSnippet(block: string): string | undefined {
  // Strip tags first (so decoded "&lt;…&gt;" text is not mistaken for markup),
  // then decode entities, then trim so decoded padding such as &nbsp; is removed.
  const toText = (fragment: string): string =>
    decodeHtmlEntities(fragment.replace(/<[^>]+>/g, '')).trim()

  const candidatePatterns: readonly RegExp[] = [
    /<p[^>]*class="b_lineclamp[^"]*"[^>]*>([\s\S]*?)<\/p>/i,
    /<div[^>]*class="b_caption[^"]*"[^>]*>[\s\S]*?<p[^>]*>([\s\S]*?)<\/p>/i,
    /<div[^>]*class="b_caption[^"]*"[^>]*>([\s\S]*?)<\/div>/i,
  ]

  for (const pattern of candidatePatterns) {
    const captured = pattern.exec(block)?.[1]
    if (captured === undefined) continue

    const text = toText(captured)
    if (text) return text
  }

  return undefined
}
|
||||||
|
|
||||||
|
/**
 * Alias for `he.decode`, exported under a descriptive name.
 * This module applies it to the href, title and snippet text taken from the
 * Bing HTML so that entity-encoded characters become plain text.
 */
export const decodeHtmlEntities = he.decode
|
||||||
|
|
||||||
|
/**
 * Resolve a link taken from a Bing result to the actual target URL.
 *
 * Bing wraps result links in redirects such as
 * `https://www.bing.com/ck/a?...&u=a1aHR0cHM6Ly9leGFtcGxlLmNvbQ...`, where the
 * `u` query parameter is a two-character prefix (a1 for https, a0 for http)
 * followed by the base64 / base64url-encoded destination.
 *
 * @param rawUrl - The href value, already HTML-entity-decoded.
 * @returns The decoded redirect target, or `rawUrl` itself when it is a
 *   direct external http(s) link. Returns `undefined` for relative/anchor
 *   links, non-http(s) schemes, unparseable URLs and Bing-internal pages.
 */
export function resolveBingUrl(rawUrl: string): string | undefined {
  // Relative paths and in-page anchors never point at an external result.
  if (rawUrl.startsWith('/') || rawUrl.startsWith('#')) return undefined

  // Try to unwrap the `u` parameter of a Bing redirect URL.
  const encoded = rawUrl.match(/[?&]u=([a-zA-Z0-9+/_=-]+)/)?.[1]
  if (encoded !== undefined && encoded.length >= 3) {
    // Drop the two-character scheme prefix and map base64url to base64.
    // Buffer.from does not throw on malformed base64, so no try/catch is
    // needed; a bogus payload simply fails the scheme check below.
    const b64 = encoded.slice(2).replace(/-/g, '+').replace(/_/g, '/')
    const decoded = Buffer.from(b64, 'base64').toString('utf-8')
    if (/^https?:\/\//i.test(decoded)) return decoded
  }

  // Not a decodable redirect: accept only absolute http(s) URLs whose host is
  // not Bing. The host is compared (rather than searching the whole string
  // for "bing.com") so external URLs that merely mention bing.com in their
  // path or query are kept.
  let parsed: URL
  try {
    parsed = new URL(rawUrl)
  } catch {
    return undefined
  }

  if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return undefined

  const host = parsed.hostname.toLowerCase()
  if (host === 'bing.com' || host.endsWith('.bing.com')) return undefined

  return rawUrl
}
|
||||||
41
src/tools/WebSearchTool/adapters/index.ts
Normal file
41
src/tools/WebSearchTool/adapters/index.ts
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
/**
|
||||||
|
* Search adapter factory — selects the appropriate backend by checking
|
||||||
|
* whether the API base URL points to Anthropic's official endpoint.
|
||||||
|
*/
|
||||||
|
|
||||||
|
import { isFirstPartyAnthropicBaseUrl } from '../../../utils/model/providers.js'
|
||||||
|
import { ApiSearchAdapter } from './apiAdapter.js'
|
||||||
|
import { BingSearchAdapter } from './bingAdapter.js'
|
||||||
|
import type { WebSearchAdapter } from './types.js'
|
||||||
|
|
||||||
|
export type { SearchResult, SearchOptions, SearchProgress, WebSearchAdapter } from './types.js'
|
||||||
|
|
||||||
|
/** Adapter instance shared by all `createAdapter()` calls in this process. */
let cachedAdapter: WebSearchAdapter | null = null

/**
 * Return the web search adapter to use.
 *
 * The Bing adapter is used unconditionally; the API adapter selection logic
 * is intentionally skipped for now. The adapter is stateless, so one
 * instance is created lazily and reused across calls.
 *
 * Disabled selection logic, kept for reference:
 *
 *   // Env override: WEB_SEARCH_ADAPTER=api|bing forces specific backend
 *   const envAdapter = process.env.WEB_SEARCH_ADAPTER
 *   if (envAdapter === 'api') {
 *     cachedAdapter = new ApiSearchAdapter()
 *     return cachedAdapter
 *   }
 *   if (envAdapter === 'bing') {
 *     cachedAdapter = new BingSearchAdapter()
 *     return cachedAdapter
 *   }
 *
 *   // Anthropic official URL → API server-side search
 *   if (isFirstPartyAnthropicBaseUrl()) {
 *     cachedAdapter = new ApiSearchAdapter()
 *     return cachedAdapter
 *   }
 *
 *   // Third-party proxies / non-Anthropic endpoints → Bing fallback
 *   cachedAdapter = new BingSearchAdapter()
 *   return cachedAdapter
 *
 * @returns The shared {@link BingSearchAdapter} instance.
 */
export function createAdapter(): WebSearchAdapter {
  if (cachedAdapter === null) {
    cachedAdapter = new BingSearchAdapter()
  }
  return cachedAdapter
}
|
||||||
22
src/tools/WebSearchTool/adapters/types.ts
Normal file
22
src/tools/WebSearchTool/adapters/types.ts
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
/** A single hit returned by a web search adapter. */
export interface SearchResult {
  /** Title text of the result. */
  title: string

  /** Target URL of the result. */
  url: string

  /** Short descriptive text for the result, when one is available. */
  snippet?: string
}
|
||||||
|
|
||||||
|
/** Per-call options accepted by `WebSearchAdapter.search`. All are optional. */
export interface SearchOptions {
  /** When non-empty, only results on these domains should be returned. */
  allowedDomains?: string[]

  /** Results on these domains should be excluded. */
  blockedDomains?: string[]

  /** Lets the caller cancel an in-flight search. */
  signal?: AbortSignal

  /** Callback invoked with progress events while the search runs. */
  onProgress?: (progress: SearchProgress) => void
}
|
||||||
|
|
||||||
|
/** Progress event delivered to `SearchOptions.onProgress`. */
export interface SearchProgress {
  /** Which stage of the search this event reports. */
  type: 'query_update' | 'search_results_received'

  /** The query the event refers to, when supplied. */
  query?: string

  /** Number of results, when supplied. */
  resultCount?: number
}
|
||||||
|
|
||||||
|
/** Contract implemented by every web search backend. */
export interface WebSearchAdapter {
  /**
   * Run a search.
   *
   * @param query - The search text.
   * @param options - Domain filters, abort signal and progress callback.
   * @returns A promise for the list of results.
   */
  search(query: string, options: SearchOptions): Promise<SearchResult[]>
}
|
||||||
Reference in New Issue
Block a user