feat: 添加 GBK 编码自动检测支持，文件读写工具透明处理非 UTF-8 文件

新增 encoding.ts 核心模块实现三层编码检测（BOM → UTF-8 fatal → GBK 回退），改造同步/异步读取路径和写入路径，使 FileReadTool/FileEditTool/FileWriteTool 能正确处理 GBK 编码文件。包含完整单元测试和 spec 文档。 Co-Authored-By: glm-5-turbo <zai-org@claude-code-best.win>
2026-06-17 13:55:50 +00:00 · 2026-05-10 20:50:12 +08:00
parent 6e1d3d8f47
commit 0ce8f7a1cb
22 changed files with 1728 additions and 121 deletions
--- a/src/utils/readFileInRange.ts
+++ b/src/utils/readFileInRange.ts
@@ -26,7 +26,8 @@
 //   On error (including maxBytes exceeded), stream.destroy(err) emits
 //   'error' → reject (passed directly to .once('error')).
 //
-// Both paths strip UTF-8 BOM and \r (CRLF → LF).
+// Both paths auto-detect encoding via encoding.ts (BOM → UTF-8 fatal → fallback chain),
+// decode with TextDecoder, and strip BOM and \r (CRLF → LF).
 //
 // mtime comes from fstat/stat on the already-open fd — no extra open().
 //
@@ -39,6 +40,7 @@

 import { createReadStream, fstat } from 'fs'
 import { stat as fsStat, readFile } from 'fs/promises'
+import { detectEncoding, decodeBuffer } from './encoding.js'
 import { formatFileSize } from './format.js'

 const FAST_PATH_MAX_SIZE = 10 * 1024 * 1024 // 10 MB
@@ -115,7 +117,9 @@ export async function readFileInRange(
      )
    }

-    const text = await readFile(filePath, { encoding: 'utf8', signal })
+    const rawBuffer = await readFile(filePath, { signal })
+    const encoding = detectEncoding(rawBuffer)
+    const text = decodeBuffer(rawBuffer, encoding)
    return readFileInRangeFast(
      text,
      stats.mtimeMs,
@@ -227,6 +231,12 @@ type StreamState = {
  isFirstChunk: boolean
  resolveMtime: (ms: number) => void
  mtimeReady: Promise<number>
+  /** Encoding detection state: null = not yet detected, string = detected */
+  encoding: string | null
+  /** TextDecoder instance: created after detection, used for streaming decode */
+  decoder: TextDecoder | null
+  /** Detection phase buffer: collects raw bytes until 4KB or stream end */
+  detectionBuffer: number[]
 }

 function streamOnOpen(this: StreamState, fd: number): void {
@@ -235,15 +245,71 @@ function streamOnOpen(this: StreamState, fd: number): void {
  })
 }

-function streamOnData(this: StreamState, chunk: string): void {
-  if (this.isFirstChunk) {
-    this.isFirstChunk = false
-    if (chunk.charCodeAt(0) === 0xfeff) {
-      chunk = chunk.slice(1)
+function processTextChunk(state: StreamState, text: string): void {
+  // BOM stripping (first chunk only)
+  if (state.isFirstChunk) {
+    state.isFirstChunk = false
+    if (text.charCodeAt(0) === 0xfeff) {
+      text = text.slice(1)
    }
  }

-  this.totalBytesRead += Buffer.byteLength(chunk)
+  const data = state.partial.length > 0 ? state.partial + text : text
+  state.partial = ''
+
+  let startPos = 0
+  let newlinePos: number
+  while ((newlinePos = data.indexOf('\n', startPos)) !== -1) {
+    if (
+      state.currentLineIndex >= state.offset &&
+      state.currentLineIndex < state.endLine
+    ) {
+      let line = data.slice(startPos, newlinePos)
+      if (line.endsWith('\r')) {
+        line = line.slice(0, -1)
+      }
+      if (state.truncateOnByteLimit && state.maxBytes !== undefined) {
+        const sep = state.selectedLines.length > 0 ? 1 : 0
+        const nextBytes = state.selectedBytes + sep + Buffer.byteLength(line)
+        if (nextBytes > state.maxBytes) {
+          state.truncatedByBytes = true
+          state.endLine = state.currentLineIndex
+        } else {
+          state.selectedBytes = nextBytes
+          state.selectedLines.push(line)
+        }
+      } else {
+        state.selectedLines.push(line)
+      }
+    }
+    state.currentLineIndex++
+    startPos = newlinePos + 1
+  }
+
+  if (startPos < data.length) {
+    if (
+      state.currentLineIndex >= state.offset &&
+      state.currentLineIndex < state.endLine
+    ) {
+      const fragment = data.slice(startPos)
+      if (state.truncateOnByteLimit && state.maxBytes !== undefined) {
+        const sep = state.selectedLines.length > 0 ? 1 : 0
+        const fragBytes =
+          state.selectedBytes + sep + Buffer.byteLength(fragment)
+        if (fragBytes > state.maxBytes) {
+          state.truncatedByBytes = true
+          state.endLine = state.currentLineIndex
+          return
+        }
+      }
+      state.partial = fragment
+    }
+  }
+}
+
+function streamOnData(this: StreamState, chunk: Buffer): void {
+  this.totalBytesRead += chunk.length
+
  if (
    !this.truncateOnByteLimit &&
    this.maxBytes !== undefined &&
@@ -255,69 +321,47 @@ function streamOnData(this: StreamState, chunk: string): void {
    return
  }

-  const data = this.partial.length > 0 ? this.partial + chunk : chunk
-  this.partial = ''
-
-  let startPos = 0
-  let newlinePos: number
-  while ((newlinePos = data.indexOf('\n', startPos)) !== -1) {
-    if (
-      this.currentLineIndex >= this.offset &&
-      this.currentLineIndex < this.endLine
-    ) {
-      let line = data.slice(startPos, newlinePos)
-      if (line.endsWith('\r')) {
-        line = line.slice(0, -1)
-      }
-      if (this.truncateOnByteLimit && this.maxBytes !== undefined) {
-        const sep = this.selectedLines.length > 0 ? 1 : 0
-        const nextBytes = this.selectedBytes + sep + Buffer.byteLength(line)
-        if (nextBytes > this.maxBytes) {
-          // Cap hit — collapse the selection range so nothing more is
-          // accumulated.  Stream continues (to count totalLines).
-          this.truncatedByBytes = true
-          this.endLine = this.currentLineIndex
-        } else {
-          this.selectedBytes = nextBytes
-          this.selectedLines.push(line)
-        }
-      } else {
-        this.selectedLines.push(line)
-      }
+  // Phase 1: Encoding detection
+  if (this.encoding === null) {
+    for (let i = 0; i < chunk.length; i++) {
+      this.detectionBuffer.push(chunk[i])
    }
-    this.currentLineIndex++
-    startPos = newlinePos + 1
+
+    // Collected at least 4KB, perform encoding detection
+    if (this.detectionBuffer.length >= 4096) {
+      this.encoding = detectEncoding(Buffer.from(this.detectionBuffer))
+      this.decoder = new TextDecoder(this.encoding, {
+        stream: true,
+      } as TextDecoderOptions)
+
+      // Decode the detection buffer and feed to line scanning
+      const decoded = this.decoder.decode(Buffer.from(this.detectionBuffer))
+      this.detectionBuffer = []
+      processTextChunk(this, decoded)
+    }
+    return
  }

-  // Only keep the trailing fragment when inside the selected range.
-  // Outside the range we just count newlines — discarding prevents
-  // unbounded memory growth on huge single-line files.
-  if (startPos < data.length) {
-    if (
-      this.currentLineIndex >= this.offset &&
-      this.currentLineIndex < this.endLine
-    ) {
-      const fragment = data.slice(startPos)
-      // In truncate mode, `partial` can grow unboundedly if the selected
-      // range contains a huge single line (no newline across many chunks).
-      // Once the fragment alone would overflow the remaining budget, we know
-      // the completed line can never fit — set truncated, collapse the
-      // selection range, and discard the fragment to stop accumulation.
-      if (this.truncateOnByteLimit && this.maxBytes !== undefined) {
-        const sep = this.selectedLines.length > 0 ? 1 : 0
-        const fragBytes = this.selectedBytes + sep + Buffer.byteLength(fragment)
-        if (fragBytes > this.maxBytes) {
-          this.truncatedByBytes = true
-          this.endLine = this.currentLineIndex
-          return
-        }
-      }
-      this.partial = fragment
-    }
-  }
+  // Phase 2: Decoding
+  const decoded = this.decoder!.decode(chunk, {
+    stream: true,
+  } as unknown as TextDecodeOptions)
+  processTextChunk(this, decoded)
 }

 function streamOnEnd(this: StreamState): void {
+  // If stream ended before detection completed (< 4KB file), detect now
+  if (this.encoding === null) {
+    this.encoding = detectEncoding(Buffer.from(this.detectionBuffer))
+    this.decoder = new TextDecoder(this.encoding, {
+      stream: true,
+    } as TextDecoderOptions)
+    const decoded = this.decoder.decode(Buffer.from(this.detectionBuffer))
+    this.detectionBuffer = []
+    processTextChunk(this, decoded)
+  }
+
+  // Handle final fragment
  let line = this.partial
  if (line.endsWith('\r')) {
    line = line.slice(0, -1)
@@ -366,7 +410,6 @@ function readFileInRangeStreaming(
  return new Promise((resolve, reject) => {
    const state: StreamState = {
      stream: createReadStream(filePath, {
-        encoding: 'utf8',
        highWaterMark: 512 * 1024,
        ...(signal ? { signal } : undefined),
      }),
@@ -384,6 +427,9 @@ function readFileInRangeStreaming(
      isFirstChunk: true,
      resolveMtime: () => {},
      mtimeReady: null as unknown as Promise<number>,
+      encoding: null,
+      decoder: null,
+      detectionBuffer: [],
    }
    state.mtimeReady = new Promise<number>(r => {
      state.resolveMtime = r