mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-17 13:55:50 +00:00
feat: 添加 GBK 编码自动检测支持,文件读写工具透明处理非 UTF-8 文件
新增 encoding.ts 核心模块实现三层编码检测(BOM → UTF-8 fatal → GBK 回退), 改造同步/异步读取路径和写入路径,使 FileReadTool/FileEditTool/FileWriteTool 能正确处理 GBK 编码文件。包含完整单元测试和 spec 文档。 Co-Authored-By: glm-5-turbo <zai-org@claude-code-best.win>
This commit is contained in:
@@ -26,7 +26,8 @@
|
||||
// On error (including maxBytes exceeded), stream.destroy(err) emits
|
||||
// 'error' → reject (passed directly to .once('error')).
|
||||
//
|
||||
// Both paths strip UTF-8 BOM and \r (CRLF → LF).
|
||||
// Both paths auto-detect encoding via encoding.ts (BOM → UTF-8 fatal → fallback chain),
|
||||
// decode with TextDecoder, and strip BOM and \r (CRLF → LF).
|
||||
//
|
||||
// mtime comes from fstat/stat on the already-open fd — no extra open().
|
||||
//
|
||||
@@ -39,6 +40,7 @@
|
||||
|
||||
import { createReadStream, fstat } from 'fs'
|
||||
import { stat as fsStat, readFile } from 'fs/promises'
|
||||
import { detectEncoding, decodeBuffer } from './encoding.js'
|
||||
import { formatFileSize } from './format.js'
|
||||
|
||||
const FAST_PATH_MAX_SIZE = 10 * 1024 * 1024 // 10 MB
|
||||
@@ -115,7 +117,9 @@ export async function readFileInRange(
|
||||
)
|
||||
}
|
||||
|
||||
const text = await readFile(filePath, { encoding: 'utf8', signal })
|
||||
const rawBuffer = await readFile(filePath, { signal })
|
||||
const encoding = detectEncoding(rawBuffer)
|
||||
const text = decodeBuffer(rawBuffer, encoding)
|
||||
return readFileInRangeFast(
|
||||
text,
|
||||
stats.mtimeMs,
|
||||
@@ -227,6 +231,12 @@ type StreamState = {
|
||||
isFirstChunk: boolean
|
||||
resolveMtime: (ms: number) => void
|
||||
mtimeReady: Promise<number>
|
||||
/** Encoding detection state: null = not yet detected, string = detected */
|
||||
encoding: string | null
|
||||
/** TextDecoder instance: created after detection, used for streaming decode */
|
||||
decoder: TextDecoder | null
|
||||
/** Detection phase buffer: collects raw bytes until 4KB or stream end */
|
||||
detectionBuffer: number[]
|
||||
}
|
||||
|
||||
function streamOnOpen(this: StreamState, fd: number): void {
|
||||
@@ -235,15 +245,71 @@ function streamOnOpen(this: StreamState, fd: number): void {
|
||||
})
|
||||
}
|
||||
|
||||
function streamOnData(this: StreamState, chunk: string): void {
|
||||
if (this.isFirstChunk) {
|
||||
this.isFirstChunk = false
|
||||
if (chunk.charCodeAt(0) === 0xfeff) {
|
||||
chunk = chunk.slice(1)
|
||||
function processTextChunk(state: StreamState, text: string): void {
|
||||
// BOM stripping (first chunk only)
|
||||
if (state.isFirstChunk) {
|
||||
state.isFirstChunk = false
|
||||
if (text.charCodeAt(0) === 0xfeff) {
|
||||
text = text.slice(1)
|
||||
}
|
||||
}
|
||||
|
||||
this.totalBytesRead += Buffer.byteLength(chunk)
|
||||
const data = state.partial.length > 0 ? state.partial + text : text
|
||||
state.partial = ''
|
||||
|
||||
let startPos = 0
|
||||
let newlinePos: number
|
||||
while ((newlinePos = data.indexOf('\n', startPos)) !== -1) {
|
||||
if (
|
||||
state.currentLineIndex >= state.offset &&
|
||||
state.currentLineIndex < state.endLine
|
||||
) {
|
||||
let line = data.slice(startPos, newlinePos)
|
||||
if (line.endsWith('\r')) {
|
||||
line = line.slice(0, -1)
|
||||
}
|
||||
if (state.truncateOnByteLimit && state.maxBytes !== undefined) {
|
||||
const sep = state.selectedLines.length > 0 ? 1 : 0
|
||||
const nextBytes = state.selectedBytes + sep + Buffer.byteLength(line)
|
||||
if (nextBytes > state.maxBytes) {
|
||||
state.truncatedByBytes = true
|
||||
state.endLine = state.currentLineIndex
|
||||
} else {
|
||||
state.selectedBytes = nextBytes
|
||||
state.selectedLines.push(line)
|
||||
}
|
||||
} else {
|
||||
state.selectedLines.push(line)
|
||||
}
|
||||
}
|
||||
state.currentLineIndex++
|
||||
startPos = newlinePos + 1
|
||||
}
|
||||
|
||||
if (startPos < data.length) {
|
||||
if (
|
||||
state.currentLineIndex >= state.offset &&
|
||||
state.currentLineIndex < state.endLine
|
||||
) {
|
||||
const fragment = data.slice(startPos)
|
||||
if (state.truncateOnByteLimit && state.maxBytes !== undefined) {
|
||||
const sep = state.selectedLines.length > 0 ? 1 : 0
|
||||
const fragBytes =
|
||||
state.selectedBytes + sep + Buffer.byteLength(fragment)
|
||||
if (fragBytes > state.maxBytes) {
|
||||
state.truncatedByBytes = true
|
||||
state.endLine = state.currentLineIndex
|
||||
return
|
||||
}
|
||||
}
|
||||
state.partial = fragment
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function streamOnData(this: StreamState, chunk: Buffer): void {
|
||||
this.totalBytesRead += chunk.length
|
||||
|
||||
if (
|
||||
!this.truncateOnByteLimit &&
|
||||
this.maxBytes !== undefined &&
|
||||
@@ -255,69 +321,47 @@ function streamOnData(this: StreamState, chunk: string): void {
|
||||
return
|
||||
}
|
||||
|
||||
const data = this.partial.length > 0 ? this.partial + chunk : chunk
|
||||
this.partial = ''
|
||||
|
||||
let startPos = 0
|
||||
let newlinePos: number
|
||||
while ((newlinePos = data.indexOf('\n', startPos)) !== -1) {
|
||||
if (
|
||||
this.currentLineIndex >= this.offset &&
|
||||
this.currentLineIndex < this.endLine
|
||||
) {
|
||||
let line = data.slice(startPos, newlinePos)
|
||||
if (line.endsWith('\r')) {
|
||||
line = line.slice(0, -1)
|
||||
}
|
||||
if (this.truncateOnByteLimit && this.maxBytes !== undefined) {
|
||||
const sep = this.selectedLines.length > 0 ? 1 : 0
|
||||
const nextBytes = this.selectedBytes + sep + Buffer.byteLength(line)
|
||||
if (nextBytes > this.maxBytes) {
|
||||
// Cap hit — collapse the selection range so nothing more is
|
||||
// accumulated. Stream continues (to count totalLines).
|
||||
this.truncatedByBytes = true
|
||||
this.endLine = this.currentLineIndex
|
||||
} else {
|
||||
this.selectedBytes = nextBytes
|
||||
this.selectedLines.push(line)
|
||||
}
|
||||
} else {
|
||||
this.selectedLines.push(line)
|
||||
}
|
||||
// Phase 1: Encoding detection
|
||||
if (this.encoding === null) {
|
||||
for (let i = 0; i < chunk.length; i++) {
|
||||
this.detectionBuffer.push(chunk[i])
|
||||
}
|
||||
this.currentLineIndex++
|
||||
startPos = newlinePos + 1
|
||||
|
||||
// Collected at least 4KB, perform encoding detection
|
||||
if (this.detectionBuffer.length >= 4096) {
|
||||
this.encoding = detectEncoding(Buffer.from(this.detectionBuffer))
|
||||
this.decoder = new TextDecoder(this.encoding, {
|
||||
stream: true,
|
||||
} as TextDecoderOptions)
|
||||
|
||||
// Decode the detection buffer and feed to line scanning
|
||||
const decoded = this.decoder.decode(Buffer.from(this.detectionBuffer))
|
||||
this.detectionBuffer = []
|
||||
processTextChunk(this, decoded)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Only keep the trailing fragment when inside the selected range.
|
||||
// Outside the range we just count newlines — discarding prevents
|
||||
// unbounded memory growth on huge single-line files.
|
||||
if (startPos < data.length) {
|
||||
if (
|
||||
this.currentLineIndex >= this.offset &&
|
||||
this.currentLineIndex < this.endLine
|
||||
) {
|
||||
const fragment = data.slice(startPos)
|
||||
// In truncate mode, `partial` can grow unboundedly if the selected
|
||||
// range contains a huge single line (no newline across many chunks).
|
||||
// Once the fragment alone would overflow the remaining budget, we know
|
||||
// the completed line can never fit — set truncated, collapse the
|
||||
// selection range, and discard the fragment to stop accumulation.
|
||||
if (this.truncateOnByteLimit && this.maxBytes !== undefined) {
|
||||
const sep = this.selectedLines.length > 0 ? 1 : 0
|
||||
const fragBytes = this.selectedBytes + sep + Buffer.byteLength(fragment)
|
||||
if (fragBytes > this.maxBytes) {
|
||||
this.truncatedByBytes = true
|
||||
this.endLine = this.currentLineIndex
|
||||
return
|
||||
}
|
||||
}
|
||||
this.partial = fragment
|
||||
}
|
||||
}
|
||||
// Phase 2: Decoding
|
||||
const decoded = this.decoder!.decode(chunk, {
|
||||
stream: true,
|
||||
} as unknown as TextDecodeOptions)
|
||||
processTextChunk(this, decoded)
|
||||
}
|
||||
|
||||
function streamOnEnd(this: StreamState): void {
|
||||
// If stream ended before detection completed (< 4KB file), detect now
|
||||
if (this.encoding === null) {
|
||||
this.encoding = detectEncoding(Buffer.from(this.detectionBuffer))
|
||||
this.decoder = new TextDecoder(this.encoding, {
|
||||
stream: true,
|
||||
} as TextDecoderOptions)
|
||||
const decoded = this.decoder.decode(Buffer.from(this.detectionBuffer))
|
||||
this.detectionBuffer = []
|
||||
processTextChunk(this, decoded)
|
||||
}
|
||||
|
||||
// Handle final fragment
|
||||
let line = this.partial
|
||||
if (line.endsWith('\r')) {
|
||||
line = line.slice(0, -1)
|
||||
@@ -366,7 +410,6 @@ function readFileInRangeStreaming(
|
||||
return new Promise((resolve, reject) => {
|
||||
const state: StreamState = {
|
||||
stream: createReadStream(filePath, {
|
||||
encoding: 'utf8',
|
||||
highWaterMark: 512 * 1024,
|
||||
...(signal ? { signal } : undefined),
|
||||
}),
|
||||
@@ -384,6 +427,9 @@ function readFileInRangeStreaming(
|
||||
isFirstChunk: true,
|
||||
resolveMtime: () => {},
|
||||
mtimeReady: null as unknown as Promise<number>,
|
||||
encoding: null,
|
||||
decoder: null,
|
||||
detectionBuffer: [],
|
||||
}
|
||||
state.mtimeReady = new Promise<number>(r => {
|
||||
state.resolveMtime = r
|
||||
|
||||
Reference in New Issue
Block a user