mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-15 12:55:51 +00:00
- braveAdapter: 读取 settings.braveApiKey (优先于环境变量) - webFetch utils: getFetchTimeoutMs() 统一读取 settings.webFetchHttpTimeoutMs,HTTP/Tavily 两条路径均生效 - tavilyAdapter: 自定义端点自动追加 /search 路径(与 fetchContentWithTavily 一致) Co-Authored-By: deepseek-v4-pro <deepseek-ai@claude-code-best.win>
594 lines
18 KiB
TypeScript
594 lines
18 KiB
TypeScript
import axios, { type AxiosResponse } from 'axios'
|
|
import { LRUCache } from 'lru-cache'
|
|
import {
|
|
type AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
|
|
logEvent,
|
|
} from 'src/services/analytics/index.js'
|
|
import { queryHaiku } from 'src/services/api/claude.js'
|
|
import { AbortError } from 'src/utils/errors.js'
|
|
import { getWebFetchUserAgent } from 'src/utils/http.js'
|
|
import { logError } from 'src/utils/log.js'
|
|
import {
|
|
isBinaryContentType,
|
|
persistBinaryContent,
|
|
} from 'src/utils/mcpOutputStorage.js'
|
|
import { getSettings_DEPRECATED } from 'src/utils/settings/settings.js'
|
|
import { asSystemPrompt } from 'src/utils/systemPromptType.js'
|
|
import { isPreapprovedHost } from './preapproved.js'
|
|
import { makeSecondaryModelPrompt } from './prompt.js'
|
|
|
|
const DEFAULT_TAVILY_EXTRACT_URL = 'https://tavily.claude-code-best.win/extract'
|
|
|
|
// Custom error class for egress proxy blocks
|
|
/**
 * Error thrown when a request is refused by the network egress proxy.
 *
 * The error message is a JSON document carrying `error_type`, `domain` and a
 * human-readable `message`.
 */
class EgressBlockedError extends Error {
  /** Hostname the proxy refused to reach. */
  public readonly domain: string

  constructor(domain: string) {
    super(
      JSON.stringify({
        error_type: 'EGRESS_BLOCKED',
        domain,
        message: `Access to ${domain} is blocked by the network egress proxy.`,
      }),
    )
    this.domain = domain
    this.name = 'EgressBlockedError'
  }
}
|
|
|
|
// Cache for storing fetched URL content
|
|
/** Shape of a single record held in the fetched-URL cache. */
type CacheEntry = {
  /** Text handed to callers: converted Markdown, or the decoded body as-is. */
  content: string
  /** Value of the response `content-type` header ('' when absent). */
  contentType: string
  /** Size in bytes recorded for the fetched content. */
  bytes: number
  /** HTTP status code of the response. */
  code: number
  /** HTTP status text of the response. */
  codeText: string
  /** Location on disk of the persisted binary body, when one was saved. */
  persistedPath?: string
  /** Size of the persisted binary body, when one was saved. */
  persistedSize?: number
}
|
|
|
|
// Cache with 15-minute TTL and 50MB size limit
|
|
// LRUCache handles automatic expiration and eviction
|
|
// Entries expire 15 minutes after being stored.
const CACHE_TTL_MS = 15 * 60 * 1000
// Total size budget for cached content: 50MB.
const MAX_CACHE_SIZE_BYTES = 50 * 1024 * 1024

// Shared cache of fetched content, keyed by the URL the caller asked for.
const URL_CACHE = new LRUCache<string, CacheEntry>({
  ttl: CACHE_TTL_MS,
  maxSize: MAX_CACHE_SIZE_BYTES,
})
|
|
|
|
/** Drops every entry from the shared fetched-URL cache. */
export function clearWebFetchCache(): void {
  URL_CACHE.clear()
}
|
|
|
|
/**
 * Normalises a raw header value to a single string.
 *
 * Strings are returned unchanged. Arrays are flattened recursively and their
 * string members joined with ', '. Anything else — including an array that
 * contains no strings — yields `undefined`.
 */
function responseHeaderToString(value: unknown): string | undefined {
  if (typeof value === 'string') {
    return value
  }
  if (!Array.isArray(value)) {
    return undefined
  }

  const collected: string[] = []
  for (const item of value) {
    const text = responseHeaderToString(item)
    if (text !== undefined) {
      collected.push(text)
    }
  }
  return collected.length === 0 ? undefined : collected.join(', ')
}
|
|
|
|
/**
 * Reads one header from an axios response as a string.
 *
 * When the headers object exposes a `get` method it is tried first; if that
 * produces no string, the lower-cased header name is looked up as a plain
 * property instead.
 *
 * @param headers Headers object taken from an axios response.
 * @param name Header name to look up.
 * @returns The header value, or `undefined` when no string value is found.
 */
function getResponseHeader(
  headers: AxiosResponse<unknown>['headers'],
  name: string,
): string | undefined {
  const accessor = headers as { get?: (headerName: string) => unknown }
  // Call `get` as a method so it keeps its receiver.
  const viaGetter =
    typeof accessor.get === 'function'
      ? responseHeaderToString(accessor.get(name))
      : undefined

  return viaGetter ?? responseHeaderToString(headers[name.toLowerCase()])
}
|
|
|
|
// Lazy singleton — defers the turndown → @mixmark-io/domino import (~1.4MB
|
|
// retained heap) until the first HTML fetch, and reuses one instance across
|
|
// calls (construction builds 15 rule objects; .turndown() is stateless).
|
|
// @types/turndown ships only `export =` (no .d.mts), so TS types the import
|
|
// as the class itself while Bun wraps CJS in { default } — hence the cast.
|
|
type TurndownCtor = typeof import('turndown')
type TurndownInstance = InstanceType<TurndownCtor>

// Memoised promise for the single shared Turndown instance.
let turndownServicePromise: Promise<TurndownInstance> | undefined

/**
 * Returns the shared Turndown instance, importing the module and constructing
 * the instance on the first call only.
 */
function getTurndownService(): Promise<TurndownInstance> {
  if (turndownServicePromise == null) {
    turndownServicePromise = import('turndown').then(loaded => {
      // The module arrives wrapped in { default }, hence the cast.
      const { default: Turndown } = loaded as unknown as {
        default: TurndownCtor
      }
      return new Turndown()
    })
  }
  return turndownServicePromise
}
|
|
|
|
// PSR requested limiting the length of URLs to 250 to lower the potential
|
|
// for a data exfiltration. However, this is too restrictive for some customers'
|
|
// legitimate use cases, such as JWT-signed URLs (e.g., cloud service signed URLs)
|
|
// that can be much longer. We already require user approval for each domain,
|
|
// which provides a primary security boundary. In addition, Claude Code has
|
|
// other data exfil channels, and this one does not seem relatively high risk,
|
|
// so I'm removing that length restriction. -ab
|
|
const MAX_URL_LENGTH = 2_000

// Per PSR:
// "Implement resource consumption controls because setting limits on CPU,
// memory, and network usage for the Web Fetch tool can prevent a single
// request or user from overwhelming the system."
const MAX_HTTP_CONTENT_LENGTH = 10 * 1024 ** 2 // 10MB response body cap
|
|
|
|
// Timeout for the main HTTP fetch request (60 seconds).
|
|
// Prevents hanging indefinitely on slow/unresponsive servers.
|
|
// Overridable via settings.webFetchHttpTimeoutMs (set in /web-tools panel).
|
|
const DEFAULT_FETCH_TIMEOUT_MS = 60_000

/**
 * Resolves the timeout (in milliseconds) applied to outbound fetch requests.
 *
 * Reads `webFetchHttpTimeoutMs` from settings. The configured value is used
 * only when it is a finite, non-negative number; anything else (missing,
 * a string, NaN, Infinity, a negative number) falls back to
 * DEFAULT_FETCH_TIMEOUT_MS so a malformed settings file cannot break every
 * request.
 *
 * @returns The timeout to hand to the HTTP client.
 */
function getFetchTimeoutMs(): number {
  const settings = getSettings_DEPRECATED() as Record<string, unknown> & {
    // Settings are user-editable, so do not trust the declared type.
    webFetchHttpTimeoutMs?: unknown
  }
  const configured = settings.webFetchHttpTimeoutMs

  // 0 is deliberately still accepted: it was passed through before this
  // validation existed (axios documents a timeout of 0 as "no timeout").
  if (
    typeof configured === 'number' &&
    Number.isFinite(configured) &&
    configured >= 0
  ) {
    return configured
  }
  return DEFAULT_FETCH_TIMEOUT_MS
}
|
|
|
|
// Cap same-host redirect hops. Without this a malicious server can return
|
|
// a redirect loop (/a → /b → /a …) and the per-request timeout
|
|
// (controlled by settings.webFetchHttpTimeoutMs)
|
|
// resets on every hop, hanging the tool until user interrupt. 10 is in the range of
|
|
// common client defaults (axios=5, follow-redirects=21, Chrome=20).
|
|
const MAX_REDIRECTS = 10

// Upper bound on the content passed on to the secondary model, so that we
// do not spend too many tokens.
export const MAX_MARKDOWN_LENGTH = 100 * 1_000
|
|
|
|
/**
 * Reports whether the URL's hostname and path are accepted by
 * `isPreapprovedHost`.
 *
 * @param url URL string to check.
 * @returns `false` when the URL cannot be parsed or the check throws.
 */
export function isPreapprovedUrl(url: string): boolean {
  try {
    const { hostname, pathname } = new URL(url)
    return isPreapprovedHost(hostname, pathname)
  } catch {
    return false
  }
}
|
|
|
|
/**
 * Performs the cheap, syntactic checks a URL must pass before it is fetched.
 *
 * A URL is rejected when it:
 * - is longer than MAX_URL_LENGTH characters,
 * - cannot be parsed,
 * - embeds a username or password, or
 * - has a hostname with fewer than two non-empty dot-separated labels
 *   (e.g. `localhost`, or `localhost.` with a trailing root dot).
 *
 * @param url URL string to validate.
 * @returns `true` when the URL passes every check.
 */
export function validateURL(url: string): boolean {
  if (url.length > MAX_URL_LENGTH) {
    return false
  }

  let parsed: URL
  try {
    parsed = new URL(url)
  } catch {
    return false
  }

  // We don't need to check protocol here, as we'll upgrade http to https when making the request

  // As long as we don't support sending cookies or reaching internal
  // domains, we should block URLs with usernames/passwords too, even though
  // these seem exceedingly unlikely.
  if (parsed.username || parsed.password) {
    return false
  }

  // Initial filter that this isn't a privileged, company-internal URL:
  // require at least two hostname labels. Empty labels are ignored so that a
  // trailing root dot ("localhost.") cannot make a single-label host look
  // like a two-label one.
  const labels = parsed.hostname.split('.').filter(label => label.length > 0)
  return labels.length >= 2
}
|
|
|
|
/**
|
|
* Check if a redirect is safe to follow
|
|
* Allows redirects that:
|
|
* - Add or remove "www." in the hostname
|
|
* - Keep the origin the same but change path/query params
|
|
* - Or both of the above
|
|
*/
|
|
export function isPermittedRedirect(
  originalUrl: string,
  redirectUrl: string,
): boolean {
  let source: URL
  let target: URL
  try {
    source = new URL(originalUrl)
    target = new URL(redirectUrl)
  } catch {
    // Either URL is unparseable: never follow.
    return false
  }

  // Scheme and port must be untouched, and the target may not carry
  // credentials.
  const sameScheme = target.protocol === source.protocol
  const samePort = target.port === source.port
  const hasCredentials = Boolean(target.username || target.password)
  if (!sameScheme || !samePort || hasCredentials) {
    return false
  }

  // Hostnames are compared with any leading "www." removed, so
  // example.com <-> www.example.com is allowed in either direction and
  // same-host redirects may change path/query freely.
  const withoutWww = (host: string): string =>
    host.startsWith('www.') ? host.slice('www.'.length) : host

  return withoutWww(source.hostname) === withoutWww(target.hostname)
}
|
|
|
|
/**
|
|
* Helper function to handle fetching URLs with custom redirect handling
|
|
* Recursively follows redirects if they pass the redirectChecker function
|
|
*
|
|
* Per PSR:
|
|
* "Do not automatically follow redirects because following redirects could
|
|
* allow for an attacker to exploit an open redirect vulnerability in a
|
|
* trusted domain to force a user to make a request to a malicious domain
|
|
* unknowingly"
|
|
*/
|
|
/** Describes a redirect that was reported to the caller instead of followed. */
type RedirectInfo = {
  /** Discriminator separating this shape from an axios response. */
  type: 'redirect'
  /** HTTP status code of the redirect response. */
  statusCode: number
  /** URL that was requested. */
  originalUrl: string
  /** Absolute URL the server redirected to. */
  redirectUrl: string
}
|
|
|
|
export async function getWithPermittedRedirects(
  url: string,
  signal: AbortSignal,
  redirectChecker: (originalUrl: string, redirectUrl: string) => boolean,
  depth = 0,
): Promise<AxiosResponse<ArrayBuffer> | RedirectInfo> {
  if (depth > MAX_REDIRECTS) {
    throw new Error(`Too many redirects (exceeded ${MAX_REDIRECTS})`)
  }

  try {
    // maxRedirects: 0 makes axios surface every redirect as an error, so
    // each hop can be vetted below before it is followed.
    return await axios.get(url, {
      signal,
      timeout: getFetchTimeoutMs(),
      maxRedirects: 0,
      responseType: 'arraybuffer',
      maxContentLength: MAX_HTTP_CONTENT_LENGTH,
      headers: {
        Accept: 'text/markdown, text/html, */*',
        'User-Agent': getWebFetchUserAgent(),
      },
    })
  } catch (error) {
    // Only axios errors that carry a response get special treatment.
    const failedResponse = axios.isAxiosError(error)
      ? error.response
      : undefined
    if (!failedResponse) {
      throw error
    }

    const { status, headers } = failedResponse
    const isRedirect =
      status === 301 || status === 302 || status === 307 || status === 308

    if (isRedirect) {
      const location = getResponseHeader(headers, 'location')
      if (!location) {
        throw new Error('Redirect missing Location header')
      }

      // Relative Location values are resolved against the requested URL.
      const redirectUrl = new URL(location, url).toString()

      if (!redirectChecker(url, redirectUrl)) {
        // Not permitted: hand the redirect details back to the caller.
        return {
          type: 'redirect',
          originalUrl: url,
          redirectUrl,
          statusCode: status,
        }
      }

      // Permitted: follow it, counting the hop against MAX_REDIRECTS.
      return getWithPermittedRedirects(
        redirectUrl,
        signal,
        redirectChecker,
        depth + 1,
      )
    }

    // Egress proxy block: a 403 carrying
    // X-Proxy-Error: blocked-by-allowlist.
    const blockedByProxy =
      status === 403 &&
      getResponseHeader(headers, 'x-proxy-error') === 'blocked-by-allowlist'
    if (blockedByProxy) {
      throw new EgressBlockedError(new URL(url).hostname)
    }

    throw error
  }
}
|
|
|
|
/** Type guard: true when the value is a RedirectInfo rather than a response. */
function isRedirectInfo(
  response: AxiosResponse<ArrayBuffer> | RedirectInfo,
): response is RedirectInfo {
  if (!('type' in response)) {
    return false
  }
  return response.type === 'redirect'
}
|
|
|
|
/** Result handed back to callers after a URL has been fetched. */
export type FetchedContent = {
  /** Fetched text: converted Markdown, or the decoded body as-is. */
  content: string
  /** Value of the response `content-type` header ('' when absent). */
  contentType: string
  /** Size in bytes recorded for the fetched content. */
  bytes: number
  /** HTTP status code of the response. */
  code: number
  /** HTTP status text of the response. */
  codeText: string
  /** Location on disk of the persisted binary body, when one was saved. */
  persistedPath?: string
  /** Size of the persisted binary body, when one was saved. */
  persistedSize?: number
}
|
|
|
|
/**
 * Fetches a URL and returns its content, converting HTML to Markdown.
 *
 * Results are served from and stored in the shared URL cache, keyed by the
 * URL exactly as passed in. http URLs are requested over https. When the
 * server answers with a redirect that is not permitted, the redirect details
 * are returned instead of content.
 *
 * @param url URL to fetch.
 * @param abortController Controller whose signal cancels the request.
 * @returns The fetched content, or redirect details for the caller to handle.
 * @throws Error('Invalid URL') when the URL fails validation.
 */
export async function getURLMarkdownContent(
  url: string,
  abortController: AbortController,
): Promise<FetchedContent | RedirectInfo> {
  if (!validateURL(url)) {
    throw new Error('Invalid URL')
  }

  // Cache lookup (LRUCache applies the TTL itself). Hand back a copy.
  const cached = URL_CACHE.get(url)
  if (cached) {
    const {
      bytes: cachedBytes,
      code,
      codeText,
      content,
      contentType: cachedContentType,
      persistedPath: cachedPath,
      persistedSize: cachedSize,
    } = cached
    return {
      bytes: cachedBytes,
      code,
      codeText,
      content,
      contentType: cachedContentType,
      persistedPath: cachedPath,
      persistedSize: cachedSize,
    }
  }

  let requestUrl = url
  try {
    const target = new URL(url)

    // Always request over https.
    if (target.protocol === 'http:') {
      target.protocol = 'https:'
      requestUrl = target.toString()
    }

    if (process.env.USER_TYPE === 'ant') {
      logEvent('tengu_web_fetch_host', {
        hostname:
          target.hostname as AnalyticsMetadata_I_VERIFIED_THIS_IS_NOT_CODE_OR_FILEPATHS,
      })
    }
  } catch (e) {
    logError(e)
  }

  const response = await getWithPermittedRedirects(
    requestUrl,
    abortController.signal,
    isPermittedRedirect,
  )

  // A redirect that was not followed is passed straight to the caller.
  if (isRedirectInfo(response)) {
    return response
  }

  const rawBuffer = Buffer.from(response.data)
  // Drop the axios-held ArrayBuffer now that rawBuffer owns the bytes, so GC
  // can reclaim up to MAX_HTTP_CONTENT_LENGTH (10MB) before Turndown builds
  // its DOM tree (which can be 3-5x the HTML size).
  ;(response as { data: unknown }).data = null
  const contentType = getResponseHeader(response.headers, 'content-type') ?? ''

  // Binary bodies are additionally written to disk with a proper extension
  // so the file can be inspected later. The utf-8 decode below still runs:
  // the saved file supplements the decoded text rather than replacing it.
  let persistedPath: string | undefined
  let persistedSize: number | undefined
  if (isBinaryContentType(contentType)) {
    const persistId = `webfetch-${Date.now()}-${Math.random().toString(36).slice(2, 8)}`
    const saved = await persistBinaryContent(rawBuffer, contentType, persistId)
    if (!('error' in saved)) {
      persistedPath = saved.filepath
      persistedSize = saved.size
    }
  }

  const bytes = rawBuffer.length
  const decodedBody = rawBuffer.toString('utf-8')

  // Only HTML goes through Turndown. For everything else the decoded text is
  // used as-is and the raw byte count doubles as the cache weight, which
  // avoids a second O(n) Buffer.byteLength scan.
  const isHtml = contentType.includes('text/html')
  const markdownContent = isHtml
    ? (await getTurndownService()).turndown(decodedBody)
    : decodedBody
  const cacheWeight = isHtml ? Buffer.byteLength(markdownContent) : bytes

  // Cached under the original URL, not the upgraded or redirected one.
  const entry: CacheEntry = {
    bytes,
    code: response.status,
    codeText: response.statusText,
    content: markdownContent,
    contentType,
    persistedPath,
    persistedSize,
  }
  // lru-cache requires positive integers; clamp to 1 for empty responses.
  URL_CACHE.set(url, entry, { size: Math.max(1, cacheWeight) })
  return entry
}
|
|
|
|
/**
|
|
* Fetch URL content via Tavily Extract API, which directly returns Markdown.
|
|
* This skips the HTML→Markdown conversion (turndown) and the secondary
|
|
* model call (queryHaiku) — Tavily already delivers clean Markdown.
|
|
*/
|
|
/**
 * Fetches a URL's content as Markdown through the Tavily Extract endpoint.
 *
 * Results are served from and stored in the shared URL cache, keyed by the
 * URL exactly as passed in, so repeated calls with the same (possibly http)
 * URL hit the cache. http URLs are sent to Tavily as https.
 *
 * The extract endpoint is derived from `settings.tavilyEndpointUrl` (or the
 * built-in default): trailing slashes are ignored, a trailing `/search` is
 * replaced by `/extract`, a trailing `/extract` is kept, and otherwise
 * `/extract` is appended.
 *
 * @param url URL whose content should be extracted.
 * @param abortController Controller whose signal cancels the request.
 * @returns The extracted content. (The return type also admits RedirectInfo
 *   for signature parity, but this function never produces one.)
 * @throws Error('Invalid URL') when the URL fails validation or parsing.
 * @throws AbortError when the signal is aborted by the time the request ends.
 * @throws Error when Tavily returns no usable content.
 */
export async function fetchContentWithTavily(
  url: string,
  abortController: AbortController,
): Promise<FetchedContent | RedirectInfo> {
  if (!validateURL(url)) {
    throw new Error('Invalid URL')
  }

  // Check cache (LRUCache handles TTL automatically)
  const cachedEntry = URL_CACHE.get(url)
  if (cachedEntry) {
    return {
      bytes: cachedEntry.bytes,
      code: cachedEntry.code,
      codeText: cachedEntry.codeText,
      content: cachedEntry.content,
      contentType: cachedEntry.contentType,
      persistedPath: cachedEntry.persistedPath,
      persistedSize: cachedEntry.persistedSize,
    }
  }

  let parsedUrl: URL
  try {
    parsedUrl = new URL(url)
  } catch {
    throw new Error('Invalid URL')
  }

  // Upgrade http to https for the outbound request only. `url` itself is
  // left untouched because it is the cache key used by the lookup above.
  let targetUrl = url
  if (parsedUrl.protocol === 'http:') {
    parsedUrl.protocol = 'https:'
    targetUrl = parsedUrl.toString()
  }

  const abortSignal = abortController.signal

  const settings = getSettings_DEPRECATED() as Record<string, unknown> & {
    tavilyEndpointUrl?: string
  }
  // Normalise trailing slashes first so "…/search/" and "…/extract/" are
  // recognised instead of ending up as "…/search/extract" or
  // "…/extract/extract".
  const baseUrl = (
    settings.tavilyEndpointUrl || DEFAULT_TAVILY_EXTRACT_URL
  ).replace(/\/+$/, '')
  let extractUrl: string
  if (baseUrl.endsWith('/search')) {
    extractUrl = baseUrl.replace(/\/search$/, '/extract')
  } else if (baseUrl.endsWith('/extract')) {
    extractUrl = baseUrl
  } else {
    extractUrl = `${baseUrl}/extract`
  }

  // The response body is external input; type it loosely and narrow below.
  type TavilyExtractPayload = {
    raw_content?: unknown
    results?: Array<{ raw_content?: unknown } | undefined>
  }

  const response = await axios.post<TavilyExtractPayload | undefined>(
    extractUrl,
    {
      urls: [targetUrl],
    },
    {
      signal: abortSignal,
      timeout: getFetchTimeoutMs(),
      headers: { 'Content-Type': 'application/json' },
    },
  )

  if (abortSignal.aborted) {
    throw new AbortError()
  }

  const payload = response.data
  // Prefer a top-level raw_content; when it is missing, blank, or not a
  // string, fall back to raw_content of the first entry in `results`.
  let markdownContent =
    typeof payload?.raw_content === 'string' ? payload.raw_content : ''
  if (!markdownContent.trim()) {
    const firstResultContent = payload?.results?.[0]?.raw_content
    if (typeof firstResultContent === 'string' && firstResultContent) {
      markdownContent = firstResultContent
    }
  }

  if (!markdownContent.trim()) {
    throw new Error(
      `Tavily Extract returned empty content for ${targetUrl}. The page may require authentication or JavaScript rendering.`,
    )
  }

  const contentBytes = Buffer.byteLength(markdownContent)

  const entry: CacheEntry = {
    bytes: contentBytes,
    code: 200,
    codeText: 'OK',
    content: markdownContent,
    contentType: 'text/markdown',
  }
  // Store under the original URL so the lookup at the top can find it.
  // lru-cache requires positive integers, hence the clamp to 1.
  URL_CACHE.set(url, entry, { size: Math.max(1, contentBytes) })
  return entry
}
|
|
|
|
/**
 * Runs the caller's prompt against fetched content using the secondary model.
 *
 * Content longer than MAX_MARKDOWN_LENGTH characters is cut off and a
 * truncation marker appended before the prompt is built.
 *
 * @param prompt Prompt to apply to the content.
 * @param markdownContent Fetched content to apply the prompt to.
 * @param signal Abort signal forwarded to the model call.
 * @param isNonInteractiveSession Forwarded to the model call options.
 * @param isPreapprovedDomain Forwarded to the prompt builder.
 * @returns Text of the first content block of the reply, or
 *   'No response from model' when there is no such text block.
 * @throws AbortError when the signal is aborted once the model call returns.
 */
export async function applyPromptToMarkdown(
  prompt: string,
  markdownContent: string,
  signal: AbortSignal,
  isNonInteractiveSession: boolean,
  isPreapprovedDomain: boolean,
): Promise<string> {
  // Cap the content to avoid "Prompt is too long" errors from the secondary
  // model.
  let contentForModel = markdownContent
  if (markdownContent.length > MAX_MARKDOWN_LENGTH) {
    contentForModel =
      markdownContent.slice(0, MAX_MARKDOWN_LENGTH) +
      '\n\n[Content truncated due to length...]'
  }

  const userPrompt = makeSecondaryModelPrompt(
    contentForModel,
    prompt,
    isPreapprovedDomain,
  )
  const assistantMessage = await queryHaiku({
    systemPrompt: asSystemPrompt([]),
    userPrompt,
    signal,
    options: {
      querySource: 'web_fetch_apply',
      agents: [],
      isNonInteractiveSession,
      hasAppendSystemPrompt: false,
      mcpTools: [],
    },
  })

  // Surface the abort as a thrown error so the tool call fails, an is_error
  // tool_use block goes back to the server, and the UI renders a red dot.
  if (signal.aborted) {
    throw new AbortError()
  }

  const { content } = assistantMessage.message!
  if (content!.length === 0) {
    return 'No response from model'
  }

  const firstBlock = content![0]
  if (
    !firstBlock ||
    typeof firstBlock !== 'object' ||
    !('text' in firstBlock)
  ) {
    return 'No response from model'
  }
  return (firstBlock as { text: string }).text
}
|