mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-18 22:35:51 +00:00
feat: 添加 GBK 编码自动检测支持,文件读写工具透明处理非 UTF-8 文件
新增 encoding.ts 核心模块实现三层编码检测(BOM → UTF-8 fatal → GBK 回退), 改造同步/异步读取路径和写入路径,使 FileReadTool/FileEditTool/FileWriteTool 能正确处理 GBK 编码文件。包含完整单元测试和 spec 文档。 Co-Authored-By: glm-5-turbo <zai-org@claude-code-best.win>
This commit is contained in:
102
src/utils/__tests__/encoding.test.ts
Normal file
102
src/utils/__tests__/encoding.test.ts
Normal file
@@ -0,0 +1,102 @@
|
||||
import { describe, test, expect } from 'bun:test'
|
||||
import {
|
||||
detectEncoding,
|
||||
decodeBuffer,
|
||||
encodeString,
|
||||
type FileEncoding,
|
||||
type DetectedEncoding,
|
||||
} from '../encoding'
|
||||
|
||||
/**
 * Table-driven checks of `detectEncoding` against fixed byte sequences.
 * Each row is registered as its own test and asserts the exact label returned.
 */
describe('detectEncoding', () => {
  type Expected = ReturnType<typeof detectEncoding>

  // Row layout: [test title, input bytes, expected detection result].
  const scenarios: ReadonlyArray<readonly [string, Buffer, Expected]> = [
    ['detects UTF-16LE BOM', Buffer.from([0xff, 0xfe, 0x48, 0x00]), 'utf-16le'],
    ['detects UTF-8 BOM', Buffer.from([0xef, 0xbb, 0xbf, 0x48, 0x65]), 'utf-8'],
    [
      'detects valid UTF-8 without BOM',
      Buffer.from('Hello, 世界', 'utf-8'),
      'utf-8',
    ],
    // "你好" in GBK: C4 E3 BA C3
    [
      'detects GBK encoded Chinese text',
      Buffer.from([0xc4, 0xe3, 0xba, 0xc3]),
      'gbk',
    ],
    ['returns utf-8 for empty buffer', Buffer.alloc(0), 'utf-8'],
    // Bytes the suite treats as neither valid UTF-8 nor GBK.
    [
      'falls back to latin1 for random bytes',
      Buffer.from([0x80, 0x81, 0x82, 0x83, 0x84, 0x85]),
      'latin1',
    ],
    // UTF-8 BOM followed by the ASCII bytes of "Hello".
    [
      'prioritizes BOM over content analysis',
      Buffer.from([0xef, 0xbb, 0xbf, 0x48, 0x65, 0x6c, 0x6c, 0x6f]),
      'utf-8',
    ],
  ]

  for (const [title, input, expected] of scenarios) {
    test(title, () => {
      expect(detectEncoding(input)).toBe(expected)
    })
  }
})
|
||||
|
||||
/**
 * Checks that `decodeBuffer` turns raw bytes into the expected string when
 * the encoding label is passed explicitly.
 */
describe('decodeBuffer', () => {
  test('decodes UTF-8 buffer correctly', () => {
    const utf8Bytes = Buffer.from('Hello, 世界', 'utf-8')
    const decoded = decodeBuffer(utf8Bytes, 'utf-8')
    expect(decoded).toBe('Hello, 世界')
  })

  test('decodes GBK buffer correctly', () => {
    // C4 E3 BA C3 is "你好" in GBK.
    const gbkBytes = Buffer.from([0xc4, 0xe3, 0xba, 0xc3])
    const decoded = decodeBuffer(gbkBytes, 'gbk')
    expect(decoded).toBe('你好')
  })

  test('decodes UTF-16LE buffer correctly', () => {
    // "Hello" as little-endian 16-bit code units, without a BOM.
    const utf16Bytes = Buffer.from([
      0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00,
    ])
    const decoded = decodeBuffer(utf16Bytes, 'utf-16le')
    expect(decoded).toBe('Hello')
  })

  test('decodes empty buffer', () => {
    const decoded = decodeBuffer(Buffer.alloc(0), 'utf-8')
    expect(decoded).toBe('')
  })
})
|
||||
|
||||
/**
 * Checks the `{ buffer, converted }` result of `encodeString` for UTF-8
 * (both spellings of the label), UTF-16LE and GBK targets.
 */
describe('encodeString', () => {
  test('encodes UTF-8 string without conversion flag', () => {
    const result = encodeString('Hello 世界', 'utf-8')
    expect(result.converted).toBe(false)
    expect(result.buffer.toString('utf-8')).toBe('Hello 世界')
  })

  test('encodes UTF-8 with utf8 alias', () => {
    const result = encodeString('test', 'utf8')
    expect(result.converted).toBe(false)
    expect(result.buffer.toString('utf-8')).toBe('test')
  })

  test('encodes UTF-16LE string', () => {
    const result = encodeString('Hello', 'utf-16le')
    expect(result.converted).toBe(false)
    // Round-trip through decodeBuffer rather than inspecting raw bytes.
    expect(decodeBuffer(result.buffer, 'utf-16le')).toBe('Hello')
  })

  test('handles GBK encoding (may convert)', () => {
    const result = encodeString('你好', 'gbk')
    expect(result.buffer).toBeInstanceOf(Buffer)
    expect(typeof result.converted).toBe('boolean')

    // The round-trip is only checked when no conversion was reported.
    if (result.converted) {
      return
    }
    expect(decodeBuffer(result.buffer, 'gbk')).toBe('你好')
  })
})
|
||||
@@ -1,10 +1,19 @@
|
||||
import { describe, expect, test } from 'bun:test'
|
||||
import { afterEach, beforeEach, describe, expect, mock, test } from 'bun:test'
|
||||
import * as fs from 'fs'
|
||||
import * as path from 'path'
|
||||
import { logMock } from '../../../tests/mocks/log'
|
||||
import { debugMock } from '../../../tests/mocks/debug'
|
||||
|
||||
mock.module('src/utils/log.ts', logMock)
|
||||
mock.module('src/utils/debug.ts', debugMock)
|
||||
|
||||
import {
|
||||
convertLeadingTabsToSpaces,
|
||||
addLineNumbers,
|
||||
stripLineNumberPrefix,
|
||||
pathsEqual,
|
||||
normalizePathForComparison,
|
||||
writeTextContent,
|
||||
} from '../file'
|
||||
|
||||
describe('convertLeadingTabsToSpaces', () => {
|
||||
@@ -90,3 +99,50 @@ describe('pathsEqual', () => {
|
||||
expect(pathsEqual('/a/b', '/a/c')).toBe(false)
|
||||
})
|
||||
})
|
||||
|
||||
/**
 * Exercises `writeTextContent` for UTF-8, UTF-16LE and GBK targets. Each test
 * runs in a fresh temporary directory that is removed afterwards.
 *
 * NOTE(review): the scratch directory is rooted at a hard-coded '/tmp'.
 * `os.tmpdir()` would be more portable, but it needs an import that is not in
 * scope for this block.
 */
describe('writeTextContent with multi-encoding', () => {
  let tmpDir: string

  beforeEach(() => {
    tmpDir = fs.mkdtempSync(path.join('/tmp', 'writeTextContent-test-'))
  })

  afterEach(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true })
  })

  test('writes UTF-8 content correctly', () => {
    const filePath = path.join(tmpDir, 'utf8.txt')
    writeTextContent(filePath, 'Hello 世界', 'utf-8', 'LF')
    const content = fs.readFileSync(filePath, 'utf-8')
    expect(content).toBe('Hello 世界')
  })

  test('writes UTF-16LE content correctly', () => {
    const filePath = path.join(tmpDir, 'utf16le.txt')
    writeTextContent(filePath, 'Hello', 'utf-16le', 'LF')
    const buf = fs.readFileSync(filePath)
    // No BOM is expected here: the raw bytes decoded as UTF-16LE must equal
    // 'Hello' exactly, and a leading FF FE would decode to an extra U+FEFF.
    const text = buf.toString('utf-16le')
    expect(text).toBe('Hello')
  })

  test('GBK write falls back to UTF-8', () => {
    const filePath = path.join(tmpDir, 'gbk.txt')
    const text = '测试写入'
    writeTextContent(filePath, text, 'gbk', 'LF')
    const written = fs.readFileSync(filePath)

    // "测试写入" in GBK: B2 E2 CA D4 D0 B4 C8 EB
    const asGbk = Buffer.from([0xb2, 0xe2, 0xca, 0xd4, 0xd0, 0xb4, 0xc8, 0xeb])
    const asUtf8 = Buffer.from(text, 'utf-8')

    // Either real GBK bytes or the UTF-8 fallback is acceptable, but the file
    // must hold exactly one of them. A bare "length > 0" check would also
    // pass for corrupted output.
    expect(written.equals(asGbk) || written.equals(asUtf8)).toBe(true)
  })

  test('CRLF line endings with GBK encoding', () => {
    const filePath = path.join(tmpDir, 'gbk-crlf.txt')
    writeTextContent(filePath, 'line1\nline2', 'gbk', 'CRLF')
    const buf = fs.readFileSync(filePath)
    // The payload is pure ASCII, so decoding as UTF-8 is safe whichever
    // encoding was actually written.
    const content = buf.toString('utf-8')
    // Should have CRLF line endings
    expect(content).toContain('\r\n')
    expect(content).not.toContain('\n\r')
  })
})
|
||||
|
||||
107
src/utils/__tests__/fileRead.test.ts
Normal file
107
src/utils/__tests__/fileRead.test.ts
Normal file
@@ -0,0 +1,107 @@
|
||||
import { afterEach, beforeEach, describe, expect, mock, test } from 'bun:test'
|
||||
import * as fs from 'fs'
|
||||
import * as path from 'path'
|
||||
import { logMock } from '../../../tests/mocks/log'
|
||||
import { debugMock } from '../../../tests/mocks/debug'
|
||||
|
||||
mock.module('src/utils/log.ts', logMock)
|
||||
mock.module('src/utils/debug.ts', debugMock)
|
||||
|
||||
import {
|
||||
readFileSyncWithMetadata,
|
||||
detectEncodingForResolvedPath,
|
||||
} from '../fileRead'
|
||||
|
||||
/**
 * Checks the `encoding`, `content` and `lineEndings` fields returned by
 * `readFileSyncWithMetadata` for fixture files written into a per-test
 * temporary directory.
 */
describe('readFileSyncWithMetadata', () => {
  let scratchDir: string

  /** Writes `data` to `name` inside the scratch directory and returns its path. */
  const writeFixture = (name: string, data: string | Buffer): string => {
    const target = path.join(scratchDir, name)
    fs.writeFileSync(target, data)
    return target
  }

  beforeEach(() => {
    scratchDir = fs.mkdtempSync(path.join('/tmp', 'fileRead-test-'))
  })

  afterEach(() => {
    fs.rmSync(scratchDir, { recursive: true, force: true })
  })

  test('reads UTF-8 file correctly', () => {
    const target = writeFixture('utf8.txt', 'Hello, 世界\n')

    const { encoding, content, lineEndings } = readFileSyncWithMetadata(target)
    expect(encoding).toBe('utf-8')
    expect(content).toBe('Hello, 世界\n')
    expect(lineEndings).toBe('LF')
  })

  test('reads GBK encoded file correctly', () => {
    // "你好世界" in GBK encoding
    const target = writeFixture(
      'gbk.txt',
      Buffer.from([0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7]),
    )

    const { encoding, content } = readFileSyncWithMetadata(target)
    expect(encoding).toBe('gbk')
    expect(content).toBe('你好世界')
  })

  test('reads empty file with utf8 encoding', () => {
    const target = writeFixture('empty.txt', '')

    const { encoding, content } = readFileSyncWithMetadata(target)
    // Note the label asserted here is 'utf8', not 'utf-8'.
    expect(encoding).toBe('utf8')
    expect(content).toBe('')
  })

  test('reads UTF-16LE BOM file correctly', () => {
    // BOM (FF FE) followed by "Hello" in UTF-16LE
    const payload = Buffer.concat([
      Buffer.from([0xff, 0xfe]),
      Buffer.from('Hello', 'utf-16le'),
    ])
    const target = writeFixture('utf16le.txt', payload)

    const { encoding, content } = readFileSyncWithMetadata(target)
    expect(encoding).toBe('utf-16le')
    expect(content).toBe('Hello')
  })

  test('normalizes CRLF to LF', () => {
    const target = writeFixture('crlf.txt', 'line1\r\nline2\r\nline3\r\n')

    const { content, lineEndings } = readFileSyncWithMetadata(target)
    expect(content).toBe('line1\nline2\nline3\n')
    expect(lineEndings).toBe('CRLF')
  })
})
|
||||
|
||||
/**
 * Checks the encoding label `detectEncodingForResolvedPath` returns for
 * files on disk: an empty file and a file holding GBK bytes.
 */
describe('detectEncodingForResolvedPath', () => {
  let scratchDir: string

  /** Writes `data` to `name` inside the scratch directory and returns its path. */
  const writeFixture = (name: string, data: string | Buffer): string => {
    const target = path.join(scratchDir, name)
    fs.writeFileSync(target, data)
    return target
  }

  beforeEach(() => {
    scratchDir = fs.mkdtempSync(path.join('/tmp', 'fileRead-detect-test-'))
  })

  afterEach(() => {
    fs.rmSync(scratchDir, { recursive: true, force: true })
  })

  test('returns utf8 for empty file', () => {
    const target = writeFixture('empty.txt', '')

    expect(detectEncodingForResolvedPath(target)).toBe('utf8')
  })

  test('detects GBK encoding from file', () => {
    // C4 E3 BA C3 is "你好" in GBK.
    const target = writeFixture('gbk.txt', Buffer.from([0xc4, 0xe3, 0xba, 0xc3]))

    expect(detectEncodingForResolvedPath(target)).toBe('gbk')
  })
})
|
||||
87
src/utils/__tests__/readFileInRange.test.ts
Normal file
87
src/utils/__tests__/readFileInRange.test.ts
Normal file
@@ -0,0 +1,87 @@
|
||||
import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
|
||||
import * as fs from 'fs'
|
||||
import * as path from 'path'
|
||||
import { readFileInRange } from '../readFileInRange'
|
||||
|
||||
/**
 * Checks `readFileInRange` on UTF-8, GBK, BOM-prefixed and empty fixture
 * files, asserting on the returned `content`, `lineCount`, `totalLines` and
 * `totalBytes` fields.
 *
 * NOTE(review): the second and third arguments look like a zero-based line
 * offset and a maximum line count (going by the "offset and maxLines" test
 * title) — confirm against the readFileInRange signature.
 */
describe('readFileInRange', () => {
  let scratchDir: string

  /** Writes `data` to `name` inside the scratch directory and returns its path. */
  const writeFixture = (name: string, data: string | Buffer): string => {
    const target = path.join(scratchDir, name)
    fs.writeFileSync(target, data)
    return target
  }

  beforeEach(() => {
    scratchDir = fs.mkdtempSync(path.join('/tmp', 'readFileInRange-test-'))
  })

  afterEach(() => {
    fs.rmSync(scratchDir, { recursive: true, force: true })
  })

  test('fast path — UTF-8 file', async () => {
    const target = writeFixture('utf8.txt', 'Hello 世界\nLine 2\nLine 3\n')

    const outcome = await readFileInRange(target, 0)
    expect(outcome.content).toBe('Hello 世界\nLine 2\nLine 3\n')
    expect(outcome.lineCount).toBe(4)
    expect(outcome.totalLines).toBe(4)
  })

  test('fast path — GBK file', async () => {
    // "你好世界" in GBK + newline
    const target = writeFixture(
      'gbk.txt',
      Buffer.from([0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0x0a]),
    )

    const outcome = await readFileInRange(target, 0)
    expect(outcome.content).toBe('你好世界\n')
    expect(outcome.totalBytes).toBe(13) // UTF-8 byte length of "你好世界\n"
  })

  test('fast path — line range on GBK file', async () => {
    // Three lines in GBK: "第一行\n第二行\n第三行\n"
    const newline = Buffer.from([0x0a])
    const gbkLines = [
      Buffer.from([0xb5, 0xda, 0xd2, 0xbb, 0xd0, 0xd0]), // 第一行
      Buffer.from([0xb5, 0xda, 0xb6, 0xfe, 0xd0, 0xd0]), // 第二行
      Buffer.from([0xb5, 0xda, 0xc8, 0xfd, 0xd0, 0xd0]), // 第三行
    ]
    // Terminate every line, including the last, with a single LF byte.
    const payload = Buffer.concat(gbkLines.flatMap(line => [line, newline]))
    const target = writeFixture('gbk-lines.txt', payload)

    const outcome = await readFileInRange(target, 1, 1)
    expect(outcome.content).toBe('第二行')
  })

  test('BOM stripping', async () => {
    const utf8Bom = Buffer.from([0xef, 0xbb, 0xbf])
    const target = writeFixture(
      'bom.txt',
      Buffer.concat([utf8Bom, Buffer.from('Hello\n')]),
    )

    const outcome = await readFileInRange(target, 0)
    expect(outcome.content).toBe('Hello\n')
  })

  test('empty file', async () => {
    const target = writeFixture('empty.txt', '')

    const outcome = await readFileInRange(target, 0)
    expect(outcome.content).toBe('')
    expect(outcome.totalLines).toBe(1)
    expect(outcome.totalBytes).toBe(0)
  })

  test('fast path — offset and maxLines', async () => {
    const target = writeFixture('lines.txt', 'a\nb\nc\nd\ne\n')

    const outcome = await readFileInRange(target, 1, 2)
    expect(outcome.content).toBe('b\nc')
    expect(outcome.lineCount).toBe(2)
  })
})
|
||||
Reference in New Issue
Block a user