feat: 添加 GBK 编码自动检测支持,文件读写工具透明处理非 UTF-8 文件

新增 encoding.ts 核心模块实现三层编码检测(BOM → UTF-8 fatal → GBK 回退),
改造同步/异步读取路径和写入路径,使 FileReadTool/FileEditTool/FileWriteTool
能正确处理 GBK 编码文件。包含完整单元测试和 spec 文档。

Co-Authored-By: glm-5-turbo <zai-org@claude-code-best.win>
This commit is contained in:
claude-code-best
2026-05-10 20:50:12 +08:00
parent 6e1d3d8f47
commit 0ce8f7a1cb
22 changed files with 1728 additions and 121 deletions

View File

@@ -13,39 +13,24 @@
*/
import { logForDebugging } from './debug.js'
import { type FileEncoding, decodeBuffer, detectEncoding } from './encoding.js'
import { getFsImplementation, safeResolvePath } from './fsOperations.js'
export type LineEndingType = 'CRLF' | 'LF'
/**
 * Detects the text encoding of the file at `resolvedPath` from a sample of
 * its leading bytes.
 *
 * Reads at most 4096 bytes through the fs implementation's `readSync`. An
 * empty sample (`bytesRead === 0`) yields `'utf8'`; otherwise the bytes that
 * were actually read are handed to `detectEncoding`.
 *
 * NOTE(review): this span looks like a diff rendered without +/- markers, so
 * the previous inline BOM checks (FF FE -> utf16le, EF BB BF -> utf8, default
 * utf8) and the previous `BufferEncoding` return type appear next to their
 * replacements (`detectEncoding`, `FileEncoding`). Confirm against the real
 * file which lines are live before relying on either version.
 *
 * @param resolvedPath - Path passed to `readSync`; presumably already resolved
 *   by the caller (TODO confirm).
 * @returns The encoding chosen for the file.
 */
export function detectEncodingForResolvedPath(
resolvedPath: string,
): BufferEncoding {
): FileEncoding {
const { buffer, bytesRead } = getFsImplementation().readSync(resolvedPath, {
length: 4096,
})
// Empty files should default to utf8, not ascii
// This fixes a bug where writing emojis/CJK to empty files caused corruption
// Empty files default to utf8 — nothing to detect
if (bytesRead === 0) {
return 'utf8'
}
// Inline BOM check: leading bytes FF FE select utf16le.
if (bytesRead >= 2) {
if (buffer[0] === 0xff && buffer[1] === 0xfe) return 'utf16le'
}
// Inline BOM check: leading bytes EF BB BF select utf8.
if (
bytesRead >= 3 &&
buffer[0] === 0xef &&
buffer[1] === 0xbb &&
buffer[2] === 0xbf
) {
return 'utf8'
}
// For non-empty files, default to utf8 since it's a superset of ascii
// and handles all Unicode characters properly
return 'utf8'
// Only the bytes actually read are inspected: the buffer is trimmed to
// bytesRead before being passed to detectEncoding.
return detectEncoding(buffer.subarray(0, bytesRead))
}
export function detectLineEndingsForString(content: string): LineEndingType {
@@ -74,7 +59,7 @@ export function detectLineEndingsForString(content: string): LineEndingType {
*/
export function readFileSyncWithMetadata(filePath: string): {
content: string
encoding: BufferEncoding
encoding: FileEncoding
lineEndings: LineEndingType
} {
const fs = getFsImplementation()
@@ -85,10 +70,10 @@ export function readFileSyncWithMetadata(filePath: string): {
}
const encoding = detectEncodingForResolvedPath(resolvedPath)
const raw = fs.readFileSync(resolvedPath, { encoding })
// Detect line endings from the raw head before CRLF normalization erases
// the distinction. 4096 code units is ≥ detectLineEndings's 4096-byte
// readSync sample (line endings are ASCII, so the unit mismatch is moot).
// Read the raw Buffer first: readFileSync's encoding option only accepts
// BufferEncoding values, not encodings such as gbk.
const rawBuffer = fs.readFileBytesSync(resolvedPath)
const raw = decodeBuffer(rawBuffer, encoding)
const lineEndings = detectLineEndingsForString(raw.slice(0, 4096))
return {
content: raw.replaceAll('\r\n', '\n'),