mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-18 06:15:51 +00:00
feat: 添加 GBK 编码自动检测支持,文件读写工具透明处理非 UTF-8 文件
新增 encoding.ts 核心模块实现三层编码检测(BOM → UTF-8 fatal → GBK 回退), 改造同步/异步读取路径和写入路径,使 FileReadTool/FileEditTool/FileWriteTool 能正确处理 GBK 编码文件。包含完整单元测试和 spec 文档。 Co-Authored-By: glm-5-turbo <zai-org@claude-code-best.win>
This commit is contained in:
@@ -3,6 +3,7 @@ import React, { Suspense, use, useMemo } from 'react';
|
||||
import { FileEditToolDiff } from 'src/components/FileEditToolDiff.js';
|
||||
import { getCwd } from 'src/utils/cwd.js';
|
||||
import { isENOENT } from 'src/utils/errors.js';
|
||||
import { decodeBuffer } from 'src/utils/encoding.js';
|
||||
import { detectEncodingForResolvedPath } from 'src/utils/fileRead.js';
|
||||
import { getFsImplementation } from 'src/utils/fsOperations.js';
|
||||
import { Text } from '@anthropic/ink';
|
||||
@@ -33,9 +34,10 @@ export function SedEditPermissionRequest({ sedInfo, ...props }: SedEditPermissio
|
||||
// render correctly. This matches what readFileSync did before the
|
||||
// async conversion.
|
||||
const encoding = detectEncodingForResolvedPath(filePath);
|
||||
const raw = await getFsImplementation().readFile(filePath, { encoding });
|
||||
const rawBuffer = await getFsImplementation().readFileBytes(filePath);
|
||||
const raw = decodeBuffer(rawBuffer, encoding).replaceAll('\r\n', '\n');
|
||||
return {
|
||||
oldContent: raw.replaceAll('\r\n', '\n'),
|
||||
oldContent: raw,
|
||||
fileExists: true,
|
||||
};
|
||||
})().catch((e: unknown): FileReadResult => {
|
||||
|
||||
102
src/utils/__tests__/encoding.test.ts
Normal file
102
src/utils/__tests__/encoding.test.ts
Normal file
@@ -0,0 +1,102 @@
|
||||
import { describe, test, expect } from 'bun:test'
|
||||
import {
|
||||
detectEncoding,
|
||||
decodeBuffer,
|
||||
encodeString,
|
||||
type FileEncoding,
|
||||
type DetectedEncoding,
|
||||
} from '../encoding'
|
||||
|
||||
// Unit tests for detectEncoding: BOM sniffing first, then strict UTF-8
// validation, then GBK, with latin1 as the last-resort answer.
describe('detectEncoding', () => {
  test('detects UTF-16LE BOM', () => {
    // FF FE is the UTF-16LE byte order mark, followed by "H"
    const buf = Buffer.from([0xff, 0xfe, 0x48, 0x00])
    expect(detectEncoding(buf)).toBe('utf-16le')
  })

  test('detects UTF-8 BOM', () => {
    // EF BB BF is the UTF-8 byte order mark, followed by "He"
    const buf = Buffer.from([0xef, 0xbb, 0xbf, 0x48, 0x65])
    expect(detectEncoding(buf)).toBe('utf-8')
  })

  test('detects valid UTF-8 without BOM', () => {
    const buf = Buffer.from('Hello, 世界', 'utf-8')
    expect(detectEncoding(buf)).toBe('utf-8')
  })

  test('detects GBK encoded Chinese text', () => {
    // "你好" in GBK: C4 E3 BA C3
    const buf = Buffer.from([0xc4, 0xe3, 0xba, 0xc3])
    expect(detectEncoding(buf)).toBe('gbk')
  })

  test('returns utf-8 for empty buffer', () => {
    const buf = Buffer.alloc(0)
    expect(detectEncoding(buf)).toBe('utf-8')
  })

  test('falls back to latin1 for random bytes', () => {
    // A fixed run of bytes 0x80–0x85 (not actually random). They are bare
    // continuation bytes in UTF-8; the expectation here is that the strict
    // GBK decoder rejects the sequence as well.
    const buf = Buffer.from([0x80, 0x81, 0x82, 0x83, 0x84, 0x85])
    expect(detectEncoding(buf)).toBe('latin1')
  })

  test('prioritizes BOM over content analysis', () => {
    // UTF-8 BOM followed by the GBK bytes for "你好". The payload is not
    // valid UTF-8, so content analysis alone could never answer 'utf-8';
    // only the BOM check can produce this result.
    const buf = Buffer.from([0xef, 0xbb, 0xbf, 0xc4, 0xe3, 0xba, 0xc3])
    expect(detectEncoding(buf)).toBe('utf-8')
  })
})
|
||||
|
||||
// Unit tests for decodeBuffer, expressed as a table of
// (test name, input bytes, encoding label, expected string).
describe('decodeBuffer', () => {
  const scenarios: Array<{
    title: string
    bytes: Buffer
    label: string
    expected: string
  }> = [
    {
      title: 'decodes UTF-8 buffer correctly',
      bytes: Buffer.from('Hello, 世界', 'utf-8'),
      label: 'utf-8',
      expected: 'Hello, 世界',
    },
    {
      // "你好" in GBK
      title: 'decodes GBK buffer correctly',
      bytes: Buffer.from([0xc4, 0xe3, 0xba, 0xc3]),
      label: 'gbk',
      expected: '你好',
    },
    {
      // "Hello" as little-endian 16-bit code units, no BOM
      title: 'decodes UTF-16LE buffer correctly',
      bytes: Buffer.from([
        0x48, 0x00, 0x65, 0x00, 0x6c, 0x00, 0x6c, 0x00, 0x6f, 0x00,
      ]),
      label: 'utf-16le',
      expected: 'Hello',
    },
    {
      title: 'decodes empty buffer',
      bytes: Buffer.alloc(0),
      label: 'utf-8',
      expected: '',
    },
  ]

  for (const { title, bytes, label, expected } of scenarios) {
    test(title, () => {
      expect(decodeBuffer(bytes, label)).toBe(expected)
    })
  }
})
|
||||
|
||||
// Unit tests for encodeString: natively supported encodings must not set the
// `converted` flag; GBK may either be encoded natively or fall back to UTF-8.
describe('encodeString', () => {
  test('encodes UTF-8 string without conversion flag', () => {
    const { buffer, converted } = encodeString('Hello 世界', 'utf-8')
    expect(converted).toBe(false)
    expect(buffer.toString('utf-8')).toBe('Hello 世界')
  })

  test('encodes UTF-8 with utf8 alias', () => {
    const { buffer, converted } = encodeString('test', 'utf8')
    expect(converted).toBe(false)
    expect(buffer.toString('utf-8')).toBe('test')
  })

  test('encodes UTF-16LE string', () => {
    const { buffer, converted } = encodeString('Hello', 'utf-16le')
    expect(converted).toBe(false)
    expect(decodeBuffer(buffer, 'utf-16le')).toBe('Hello')
  })

  test('handles GBK encoding (may convert)', () => {
    const { buffer, converted } = encodeString('你好', 'gbk')
    expect(buffer).toBeInstanceOf(Buffer)
    expect(typeof converted).toBe('boolean')
    // Whichever path was taken, the bytes must decode back to the input:
    // as UTF-8 when the fallback was used, as GBK otherwise. Previously the
    // fallback branch was not checked at all.
    const writtenAs = converted ? 'utf-8' : 'gbk'
    expect(decodeBuffer(buffer, writtenAs)).toBe('你好')
  })
})
|
||||
@@ -1,10 +1,19 @@
|
||||
import { describe, expect, test } from 'bun:test'
|
||||
import { afterEach, beforeEach, describe, expect, mock, test } from 'bun:test'
|
||||
import * as fs from 'fs'
|
||||
import * as path from 'path'
|
||||
import { logMock } from '../../../tests/mocks/log'
|
||||
import { debugMock } from '../../../tests/mocks/debug'
|
||||
|
||||
mock.module('src/utils/log.ts', logMock)
|
||||
mock.module('src/utils/debug.ts', debugMock)
|
||||
|
||||
import {
|
||||
convertLeadingTabsToSpaces,
|
||||
addLineNumbers,
|
||||
stripLineNumberPrefix,
|
||||
pathsEqual,
|
||||
normalizePathForComparison,
|
||||
writeTextContent,
|
||||
} from '../file'
|
||||
|
||||
describe('convertLeadingTabsToSpaces', () => {
|
||||
@@ -90,3 +99,50 @@ describe('pathsEqual', () => {
|
||||
expect(pathsEqual('/a/b', '/a/c')).toBe(false)
|
||||
})
|
||||
})
|
||||
|
||||
// Integration tests for writeTextContent against real files in a fresh
// temporary directory that is removed after every test.
describe('writeTextContent with multi-encoding', () => {
  let tmpDir: string

  beforeEach(() => {
    tmpDir = fs.mkdtempSync(path.join('/tmp', 'writeTextContent-test-'))
  })

  afterEach(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true })
  })

  test('writes UTF-8 content correctly', () => {
    const filePath = path.join(tmpDir, 'utf8.txt')
    writeTextContent(filePath, 'Hello 世界', 'utf-8', 'LF')
    const content = fs.readFileSync(filePath, 'utf-8')
    expect(content).toBe('Hello 世界')
  })

  test('writes UTF-16LE content correctly', () => {
    const filePath = path.join(tmpDir, 'utf16le.txt')
    writeTextContent(filePath, 'Hello', 'utf-16le', 'LF')
    const buf = fs.readFileSync(filePath)
    // The expectation below only holds when no BOM is written:
    // Buffer#toString('utf-16le') would keep a leading U+FEFF in the text.
    const text = buf.toString('utf-16le')
    expect(text).toBe('Hello')
  })

  test('GBK write falls back to UTF-8', () => {
    const filePath = path.join(tmpDir, 'gbk.txt')
    const original = '测试写入'
    writeTextContent(filePath, original, 'gbk', 'LF')
    const written = fs.readFileSync(filePath)
    // The file must hold the text either as UTF-8 (fallback) or as real GBK;
    // a mere "non-empty" check would also accept corrupted output.
    const readings = [
      written.toString('utf-8'),
      new TextDecoder('gbk').decode(written),
    ]
    expect(readings).toContain(original)
  })

  test('CRLF line endings with GBK encoding', () => {
    const filePath = path.join(tmpDir, 'gbk-crlf.txt')
    writeTextContent(filePath, 'line1\nline2', 'gbk', 'CRLF')
    const buf = fs.readFileSync(filePath)
    const content = buf.toString('utf-8')
    // The content is pure ASCII, so the bytes are identical in GBK and UTF-8
    // and the exact result can be pinned: one CRLF, no stray CR or LF.
    expect(content).toBe('line1\r\nline2')
  })
})
|
||||
|
||||
107
src/utils/__tests__/fileRead.test.ts
Normal file
107
src/utils/__tests__/fileRead.test.ts
Normal file
@@ -0,0 +1,107 @@
|
||||
import { afterEach, beforeEach, describe, expect, mock, test } from 'bun:test'
|
||||
import * as fs from 'fs'
|
||||
import * as path from 'path'
|
||||
import { logMock } from '../../../tests/mocks/log'
|
||||
import { debugMock } from '../../../tests/mocks/debug'
|
||||
|
||||
mock.module('src/utils/log.ts', logMock)
|
||||
mock.module('src/utils/debug.ts', debugMock)
|
||||
|
||||
import {
|
||||
readFileSyncWithMetadata,
|
||||
detectEncodingForResolvedPath,
|
||||
} from '../fileRead'
|
||||
|
||||
// Integration tests for readFileSyncWithMetadata. Each test writes a fixture
// into a per-test temporary directory and inspects the returned metadata.
describe('readFileSyncWithMetadata', () => {
  let tmpDir: string

  beforeEach(() => {
    tmpDir = fs.mkdtempSync(path.join('/tmp', 'fileRead-test-'))
  })

  afterEach(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true })
  })

  /** Writes `data` to `name` inside the temp dir and reads it back. */
  const readFixture = (name: string, data: string | Buffer) => {
    const target = path.join(tmpDir, name)
    fs.writeFileSync(target, data)
    return readFileSyncWithMetadata(target)
  }

  test('reads UTF-8 file correctly', () => {
    const { encoding, content, lineEndings } = readFixture(
      'utf8.txt',
      'Hello, 世界\n',
    )
    expect(encoding).toBe('utf-8')
    expect(content).toBe('Hello, 世界\n')
    expect(lineEndings).toBe('LF')
  })

  test('reads GBK encoded file correctly', () => {
    // "你好世界" in GBK encoding
    const gbkPayload = Buffer.from([
      0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7,
    ])
    const { encoding, content } = readFixture('gbk.txt', gbkPayload)
    expect(encoding).toBe('gbk')
    expect(content).toBe('你好世界')
  })

  test('reads empty file with utf8 encoding', () => {
    const { encoding, content } = readFixture('empty.txt', '')
    expect(encoding).toBe('utf8')
    expect(content).toBe('')
  })

  test('reads UTF-16LE BOM file correctly', () => {
    // BOM + "Hello" in UTF-16LE
    const withBom = Buffer.concat([
      Buffer.from([0xff, 0xfe]),
      Buffer.from('Hello', 'utf-16le'),
    ])
    const { encoding, content } = readFixture('utf16le.txt', withBom)
    expect(encoding).toBe('utf-16le')
    expect(content).toBe('Hello')
  })

  test('normalizes CRLF to LF', () => {
    const { content, lineEndings } = readFixture(
      'crlf.txt',
      'line1\r\nline2\r\nline3\r\n',
    )
    expect(content).toBe('line1\nline2\nline3\n')
    expect(lineEndings).toBe('CRLF')
  })
})
|
||||
|
||||
// Integration tests for detectEncodingForResolvedPath against files written
// into a per-test temporary directory.
describe('detectEncodingForResolvedPath', () => {
  let tmpDir: string

  beforeEach(() => {
    tmpDir = fs.mkdtempSync(path.join('/tmp', 'fileRead-detect-test-'))
  })

  afterEach(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true })
  })

  /** Writes `data` to `name` inside the temp dir and detects its encoding. */
  const detectFixture = (name: string, data: string | Buffer) => {
    const target = path.join(tmpDir, name)
    fs.writeFileSync(target, data)
    return detectEncodingForResolvedPath(target)
  }

  test('returns utf8 for empty file', () => {
    expect(detectFixture('empty.txt', '')).toBe('utf8')
  })

  test('detects GBK encoding from file', () => {
    // "你好" in GBK
    const gbkPayload = Buffer.from([0xc4, 0xe3, 0xba, 0xc3])
    expect(detectFixture('gbk.txt', gbkPayload)).toBe('gbk')
  })
})
|
||||
87
src/utils/__tests__/readFileInRange.test.ts
Normal file
87
src/utils/__tests__/readFileInRange.test.ts
Normal file
@@ -0,0 +1,87 @@
|
||||
import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
|
||||
import * as fs from 'fs'
|
||||
import * as path from 'path'
|
||||
import { readFileInRange } from '../readFileInRange'
|
||||
|
||||
// Integration tests for readFileInRange on small fixtures written into a
// per-test temporary directory.
describe('readFileInRange', () => {
  let tmpDir: string

  beforeEach(() => {
    tmpDir = fs.mkdtempSync(path.join('/tmp', 'readFileInRange-test-'))
  })

  afterEach(() => {
    fs.rmSync(tmpDir, { recursive: true, force: true })
  })

  /** Writes `data` to `name` inside the temp dir and returns the full path. */
  const writeFixture = (name: string, data: string | Buffer): string => {
    const target = path.join(tmpDir, name)
    fs.writeFileSync(target, data)
    return target
  }

  test('fast path — UTF-8 file', async () => {
    const body = 'Hello 世界\nLine 2\nLine 3\n'
    const result = await readFileInRange(writeFixture('utf8.txt', body), 0)
    expect(result.content).toBe(body)
    expect(result.lineCount).toBe(4)
    expect(result.totalLines).toBe(4)
  })

  test('fast path — GBK file', async () => {
    // "你好世界" in GBK + newline
    const gbkPayload = Buffer.from([
      0xc4, 0xe3, 0xba, 0xc3, 0xca, 0xc0, 0xbd, 0xe7, 0x0a,
    ])
    const result = await readFileInRange(writeFixture('gbk.txt', gbkPayload), 0)
    expect(result.content).toBe('你好世界\n')
    expect(result.totalBytes).toBe(13) // UTF-8 byte length of "你好世界\n"
  })

  test('fast path — line range on GBK file', async () => {
    // Three lines in GBK: "第一行\n第二行\n第三行\n"
    const NEWLINE = 0x0a
    const gbkRows = [
      [0xb5, 0xda, 0xd2, 0xbb, 0xd0, 0xd0], // 第一行
      [0xb5, 0xda, 0xb6, 0xfe, 0xd0, 0xd0], // 第二行
      [0xb5, 0xda, 0xc8, 0xfd, 0xd0, 0xd0], // 第三行
    ]
    const payload = Buffer.from(gbkRows.flatMap(row => [...row, NEWLINE]))
    const result = await readFileInRange(
      writeFixture('gbk-lines.txt', payload),
      1,
      1,
    )
    expect(result.content).toBe('第二行')
  })

  test('BOM stripping', async () => {
    // UTF-8 byte order mark followed by "Hello\n"
    const payload = Buffer.concat([
      Buffer.from([0xef, 0xbb, 0xbf]),
      Buffer.from('Hello\n'),
    ])
    const result = await readFileInRange(writeFixture('bom.txt', payload), 0)
    expect(result.content).toBe('Hello\n')
  })

  test('empty file', async () => {
    const result = await readFileInRange(writeFixture('empty.txt', ''), 0)
    expect(result.content).toBe('')
    expect(result.totalLines).toBe(1)
    expect(result.totalBytes).toBe(0)
  })

  test('fast path — offset and maxLines', async () => {
    const target = writeFixture('lines.txt', 'a\nb\nc\nd\ne\n')
    const result = await readFileInRange(target, 1, 2)
    expect(result.content).toBe('b\nc')
    expect(result.lineCount).toBe(2)
  })
})
|
||||
90
src/utils/encoding.ts
Normal file
90
src/utils/encoding.ts
Normal file
@@ -0,0 +1,90 @@
|
||||
/**
|
||||
* Encoding detection and conversion utilities for file I/O.
|
||||
*
|
||||
* Provides three-layer encoding detection (BOM → UTF-8 fatal → GBK fallback)
|
||||
* and Buffer/string conversion functions. Zero external dependencies — uses only
|
||||
* TextDecoder/TextEncoder APIs available in Bun/Node.js.
|
||||
*/
|
||||
|
||||
/** Extended encoding type covering non-UTF-8 encodings used in CJK files */
|
||||
export type FileEncoding = BufferEncoding | 'gbk'
|
||||
|
||||
/** Encoding name accepted by TextDecoder (string), broader than FileEncoding */
|
||||
export type DetectedEncoding = string
|
||||
|
||||
/**
 * Guess the text encoding of `buffer`.
 *
 * Checks run in order and the first match wins:
 *   1. Byte order mark: FF FE → 'utf-16le', EF BB BF → 'utf-8'.
 *   2. The whole buffer decodes as strict (fatal) UTF-8 → 'utf-8'.
 *   3. The whole buffer decodes as strict (fatal) GBK → 'gbk'.
 *   4. Otherwise → 'latin1'.
 *
 * An empty buffer is valid UTF-8 and therefore yields 'utf-8'.
 *
 * @param buffer - Raw bytes to inspect.
 * @returns The detected encoding name.
 */
export function detectEncoding(buffer: Buffer): FileEncoding {
  // True when the buffer begins with exactly the given byte signature.
  const startsWith = (...signature: number[]): boolean =>
    buffer.length >= signature.length &&
    signature.every((byte, index) => buffer[index] === byte)

  // True when a fatal-mode decoder for `label` accepts the whole buffer.
  // A decoder that cannot be constructed or that rejects the input both
  // count as "does not decode".
  const decodesStrictly = (label: string): boolean => {
    try {
      new TextDecoder(label, { fatal: true }).decode(buffer)
      return true
    } catch {
      return false
    }
  }

  // Layer 1: BOM detection
  if (startsWith(0xff, 0xfe)) return 'utf-16le'
  if (startsWith(0xef, 0xbb, 0xbf)) return 'utf-8'

  // Layer 2: UTF-8 fatal validation
  if (decodesStrictly('utf-8')) return 'utf-8'

  // Layer 3: GBK, then the single-byte catch-all
  return decodesStrictly('gbk') ? 'gbk' : 'latin1'
}
|
||||
|
||||
/**
 * Decode a buffer using the specified encoding.
 * Unified decoding entry point for all file read paths.
 *
 * Most encodings are decoded with `TextDecoder`, which by default also
 * removes a leading byte order mark for UTF-8 and UTF-16LE. Two groups of
 * names are handled specially because `TextDecoder` gets them wrong for
 * file round-tripping:
 *
 * - `latin1` / `binary`: the Encoding Standard maps the label `latin1` to
 *   windows-1252, where bytes 0x80–0x9F become characters above U+00FF
 *   (0x80 → "€"). Writing such text back as latin1 cannot reproduce the
 *   original bytes. `Buffer#toString('latin1')` maps every byte N to U+00NN,
 *   which round-trips exactly.
 * - `utf16le` / `ucs2` / `ucs-2`: Node's spellings of UTF-16LE. They are
 *   normalized to the standard label `utf-16le`, since the unhyphenated
 *   forms are not TextDecoder labels.
 *
 * @param buffer - Raw bytes to decode.
 * @param encoding - Encoding name; matched case-insensitively.
 * @returns The decoded text.
 * @throws RangeError if the runtime's TextDecoder does not know the label.
 */
export function decodeBuffer(
  buffer: Buffer,
  encoding: DetectedEncoding,
): string {
  const name = encoding.toLowerCase()

  // Byte-for-byte ISO-8859-1, so a later latin1 write restores the input.
  if (name === 'latin1' || name === 'binary') {
    return buffer.toString('latin1')
  }

  const label =
    name === 'utf16le' || name === 'ucs2' || name === 'ucs-2'
      ? 'utf-16le'
      : name
  return new TextDecoder(label).decode(buffer)
}
|
||||
|
||||
/**
 * Encode a string to a Buffer using the specified encoding.
 *
 * Encodings that `Buffer` supports natively (utf-8, utf-16le, latin1, ...)
 * are encoded as requested. Any other encoding (e.g. gbk) has no encoder
 * here, so the content is encoded as UTF-8 instead and `converted` is set.
 *
 * Support is decided with `Buffer.isEncoding` instead of catching whatever
 * `Buffer.from` throws, so the outcome does not depend on how a particular
 * runtime reacts to an unknown encoding name and unrelated errors are not
 * swallowed.
 *
 * @param content - Text to encode.
 * @param encoding - Target encoding name.
 * @returns `buffer` - the encoded bytes; `converted` - true if the encoding
 *   fell back to UTF-8 (caller should warn the user).
 */
export function encodeString(
  content: string,
  encoding: DetectedEncoding,
): { buffer: Buffer; converted: boolean } {
  if (Buffer.isEncoding(encoding)) {
    return { buffer: Buffer.from(content, encoding), converted: false }
  }

  // No native encoder for this name: fall back to UTF-8 and report it.
  return { buffer: Buffer.from(content, 'utf-8'), converted: true }
}
|
||||
@@ -22,6 +22,7 @@ import {
|
||||
detectLineEndingsForString,
|
||||
type LineEndingType,
|
||||
} from './fileRead.js'
|
||||
import { type FileEncoding, decodeBuffer, encodeString } from './encoding.js'
|
||||
import { fileReadCache } from './fileReadCache.js'
|
||||
import { getFsImplementation, safeResolvePath } from './fsOperations.js'
|
||||
import { logError } from './log.js'
|
||||
@@ -84,7 +85,7 @@ export async function getFileModificationTimeAsync(
|
||||
export function writeTextContent(
|
||||
filePath: string,
|
||||
content: string,
|
||||
encoding: BufferEncoding,
|
||||
encoding: FileEncoding,
|
||||
endings: LineEndingType,
|
||||
): void {
|
||||
let toWrite = content
|
||||
@@ -94,10 +95,38 @@ export function writeTextContent(
|
||||
toWrite = content.replaceAll('\r\n', '\n').split('\n').join('\r\n')
|
||||
}
|
||||
|
||||
writeFileSyncAndFlush_DEPRECATED(filePath, toWrite, { encoding })
|
||||
// Check if encoding is directly supported by Node.js fs
|
||||
const BUFFER_ENCODINGS = new Set<string>([
|
||||
'utf8',
|
||||
'utf-8',
|
||||
'utf16le',
|
||||
'ucs2',
|
||||
'ucs-2',
|
||||
'ascii',
|
||||
'latin1',
|
||||
'binary',
|
||||
'base64',
|
||||
'hex',
|
||||
])
|
||||
|
||||
if (BUFFER_ENCODINGS.has(encoding)) {
|
||||
writeFileSyncAndFlush_DEPRECATED(filePath, toWrite, {
|
||||
encoding: encoding as BufferEncoding,
|
||||
})
|
||||
} else {
|
||||
// Non-BufferEncoding (e.g. gbk): use encodeString to get Buffer
|
||||
const { buffer, converted } = encodeString(toWrite, encoding)
|
||||
writeFileSyncAndFlush_DEPRECATED(filePath, buffer, { buffer })
|
||||
if (converted) {
|
||||
logForDebugging(
|
||||
`writeTextContent: encoding '${encoding}' unsupported for write, fell back to UTF-8 for ${filePath}`,
|
||||
{ level: 'warn' },
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export function detectFileEncoding(filePath: string): BufferEncoding {
|
||||
export function detectFileEncoding(filePath: string): FileEncoding {
|
||||
try {
|
||||
const fs = getFsImplementation()
|
||||
const { resolvedPath } = safeResolvePath(fs, filePath)
|
||||
@@ -119,14 +148,14 @@ export function detectFileEncoding(filePath: string): BufferEncoding {
|
||||
|
||||
export function detectLineEndings(
|
||||
filePath: string,
|
||||
encoding: BufferEncoding = 'utf8',
|
||||
encoding: FileEncoding = 'utf8',
|
||||
): LineEndingType {
|
||||
try {
|
||||
const fs = getFsImplementation()
|
||||
const { resolvedPath } = safeResolvePath(fs, filePath)
|
||||
const { buffer, bytesRead } = fs.readSync(resolvedPath, { length: 4096 })
|
||||
|
||||
const content = buffer.toString(encoding, 0, bytesRead)
|
||||
const content = decodeBuffer(buffer.subarray(0, bytesRead), encoding)
|
||||
return detectLineEndingsForString(content)
|
||||
} catch (error) {
|
||||
logError(error)
|
||||
@@ -361,8 +390,10 @@ export function readFileSyncCached(filePath: string): string {
|
||||
*/
|
||||
export function writeFileSyncAndFlush_DEPRECATED(
|
||||
filePath: string,
|
||||
content: string,
|
||||
options: { encoding: BufferEncoding; mode?: number } = { encoding: 'utf-8' },
|
||||
content: string | Buffer,
|
||||
options: { encoding?: BufferEncoding; mode?: number; buffer?: Buffer } = {
|
||||
encoding: 'utf-8',
|
||||
},
|
||||
): void {
|
||||
const fs = getFsImplementation()
|
||||
|
||||
@@ -403,26 +434,30 @@ export function writeFileSyncAndFlush_DEPRECATED(
|
||||
}
|
||||
}
|
||||
|
||||
// Determine write mode before try/catch so both paths can use it
|
||||
const isBufferWrite = Buffer.isBuffer(content) || options.buffer !== undefined
|
||||
const writeData = options.buffer ?? content
|
||||
|
||||
try {
|
||||
logForDebugging(`Writing to temp file: ${tempPath}`)
|
||||
|
||||
// Write to temp file with flush and mode (if specified for new file)
|
||||
const writeOptions: {
|
||||
encoding: BufferEncoding
|
||||
encoding?: BufferEncoding
|
||||
flush: boolean
|
||||
mode?: number
|
||||
} = {
|
||||
encoding: options.encoding,
|
||||
flush: true,
|
||||
...(isBufferWrite ? {} : { encoding: options.encoding ?? 'utf-8' }),
|
||||
}
|
||||
// Only set mode in writeFileSync for new files to ensure atomic permission setting
|
||||
if (!targetExists && options.mode !== undefined) {
|
||||
writeOptions.mode = options.mode
|
||||
}
|
||||
|
||||
fsWriteFileSync(tempPath, content, writeOptions)
|
||||
fsWriteFileSync(tempPath, writeData, writeOptions)
|
||||
logForDebugging(
|
||||
`Temp file written successfully, size: ${content.length} bytes`,
|
||||
`Temp file written successfully, size: ${typeof writeData === 'string' ? writeData.length : writeData.byteLength} bytes`,
|
||||
)
|
||||
|
||||
// For existing files or if mode was not set atomically, apply permissions
|
||||
@@ -454,19 +489,19 @@ export function writeFileSyncAndFlush_DEPRECATED(
|
||||
logForDebugging(`Falling back to non-atomic write for ${targetPath}`)
|
||||
try {
|
||||
const fallbackOptions: {
|
||||
encoding: BufferEncoding
|
||||
encoding?: BufferEncoding
|
||||
flush: boolean
|
||||
mode?: number
|
||||
} = {
|
||||
encoding: options.encoding,
|
||||
flush: true,
|
||||
...(isBufferWrite ? {} : { encoding: options.encoding ?? 'utf-8' }),
|
||||
}
|
||||
// Only set mode for new files
|
||||
if (!targetExists && options.mode !== undefined) {
|
||||
fallbackOptions.mode = options.mode
|
||||
}
|
||||
|
||||
fsWriteFileSync(targetPath, content, fallbackOptions)
|
||||
fsWriteFileSync(targetPath, writeData, fallbackOptions)
|
||||
logForDebugging(
|
||||
`File ${targetPath} written successfully with non-atomic fallback`,
|
||||
)
|
||||
|
||||
@@ -13,39 +13,24 @@
|
||||
*/
|
||||
|
||||
import { logForDebugging } from './debug.js'
|
||||
import { type FileEncoding, decodeBuffer, detectEncoding } from './encoding.js'
|
||||
import { getFsImplementation, safeResolvePath } from './fsOperations.js'
|
||||
|
||||
export type LineEndingType = 'CRLF' | 'LF'
|
||||
|
||||
/**
 * Detect the encoding of the file at `resolvedPath` from its first 4096 bytes.
 *
 * Empty files report 'utf8'. Otherwise the sample is passed to
 * `detectEncoding`.
 *
 * When the sample fills the whole 4096-byte window, the file may continue
 * past it and the cut can land in the middle of a multi-byte character.
 * Strict validation then rejects an otherwise valid UTF-8 (or GBK) sample.
 * To compensate, a full sample that was not recognised as UTF-8/UTF-16LE is
 * re-checked with up to 3 trailing bytes removed (a UTF-8 character is at
 * most 4 bytes long, so at most 3 bytes can be left dangling), and a sample
 * that fell through to 'latin1' is re-checked as GBK with 1 trailing byte
 * removed (a dangling GBK lead byte).
 *
 * NOTE(review): if everything before the cut is ASCII, a trailing GBK
 * character can look like a truncated UTF-8 sequence; such a sample is
 * reported as 'utf-8'.
 *
 * @param resolvedPath - Path to read; passed to `readSync` as-is.
 * @returns The detected encoding.
 */
export function detectEncodingForResolvedPath(
  resolvedPath: string,
): FileEncoding {
  const SAMPLE_SIZE = 4096
  const { buffer, bytesRead } = getFsImplementation().readSync(resolvedPath, {
    length: SAMPLE_SIZE,
  })

  // Empty files default to utf8 — nothing to detect
  if (bytesRead === 0) {
    return 'utf8'
  }

  const sample = buffer.subarray(0, bytesRead)
  const detected = detectEncoding(sample)

  // A short read or a positive UTF-8/UTF-16LE result needs no second look.
  const possiblyTruncated = bytesRead >= SAMPLE_SIZE
  if (!possiblyTruncated || detected === 'utf-8' || detected === 'utf-16le') {
    return detected
  }

  // Retry without a dangling partial UTF-8 sequence at the end of the sample.
  for (let trim = 1; trim <= 3; trim++) {
    if (detectEncoding(sample.subarray(0, bytesRead - trim)) === 'utf-8') {
      return 'utf-8'
    }
  }

  // Retry without a dangling GBK lead byte at the end of the sample.
  if (
    detected === 'latin1' &&
    detectEncoding(sample.subarray(0, bytesRead - 1)) === 'gbk'
  ) {
    return 'gbk'
  }

  return detected
}
|
||||
|
||||
export function detectLineEndingsForString(content: string): LineEndingType {
|
||||
@@ -74,7 +59,7 @@ export function detectLineEndingsForString(content: string): LineEndingType {
|
||||
*/
|
||||
export function readFileSyncWithMetadata(filePath: string): {
|
||||
content: string
|
||||
encoding: BufferEncoding
|
||||
encoding: FileEncoding
|
||||
lineEndings: LineEndingType
|
||||
} {
|
||||
const fs = getFsImplementation()
|
||||
@@ -85,10 +70,10 @@ export function readFileSyncWithMetadata(filePath: string): {
|
||||
}
|
||||
|
||||
const encoding = detectEncodingForResolvedPath(resolvedPath)
|
||||
const raw = fs.readFileSync(resolvedPath, { encoding })
|
||||
// Detect line endings from the raw head before CRLF normalization erases
|
||||
// the distinction. 4096 code units is ≥ detectLineEndings's 4096-byte
|
||||
// readSync sample (line endings are ASCII, so the unit mismatch is moot).
|
||||
// Read raw Buffer first — readFileSync encoding option only accepts
|
||||
// BufferEncoding, not gbk etc.
|
||||
const rawBuffer = fs.readFileBytesSync(resolvedPath)
|
||||
const raw = decodeBuffer(rawBuffer, encoding)
|
||||
const lineEndings = detectLineEndingsForString(raw.slice(0, 4096))
|
||||
return {
|
||||
content: raw.replaceAll('\r\n', '\n'),
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
import { detectFileEncoding } from './file.js'
|
||||
import { type FileEncoding, decodeBuffer } from './encoding.js'
|
||||
import { getFsImplementation } from './fsOperations.js'
|
||||
|
||||
type CachedFileData = {
|
||||
content: string
|
||||
encoding: BufferEncoding
|
||||
encoding: FileEncoding
|
||||
mtime: number
|
||||
}
|
||||
|
||||
@@ -19,7 +20,7 @@ class FileReadCache {
|
||||
* Reads a file with caching. Returns both content and encoding.
|
||||
* Cache key includes file path and modification time for automatic invalidation.
|
||||
*/
|
||||
readFile(filePath: string): { content: string; encoding: BufferEncoding } {
|
||||
readFile(filePath: string): { content: string; encoding: FileEncoding } {
|
||||
const fs = getFsImplementation()
|
||||
|
||||
// Get file stats for cache invalidation
|
||||
@@ -45,9 +46,8 @@ class FileReadCache {
|
||||
|
||||
// Cache miss or stale data - read the file
|
||||
const encoding = detectFileEncoding(filePath)
|
||||
const content = fs
|
||||
.readFileSync(filePath, { encoding })
|
||||
.replaceAll('\r\n', '\n')
|
||||
const rawBuffer = fs.readFileBytesSync(filePath)
|
||||
const content = decodeBuffer(rawBuffer, encoding).replaceAll('\r\n', '\n')
|
||||
|
||||
// Update cache
|
||||
this.cache.set(cacheKey, {
|
||||
|
||||
@@ -26,7 +26,8 @@
|
||||
// On error (including maxBytes exceeded), stream.destroy(err) emits
|
||||
// 'error' → reject (passed directly to .once('error')).
|
||||
//
|
||||
// Both paths strip UTF-8 BOM and \r (CRLF → LF).
|
||||
// Both paths auto-detect encoding via encoding.ts (BOM → UTF-8 fatal → fallback chain),
|
||||
// decode with TextDecoder, and strip BOM and \r (CRLF → LF).
|
||||
//
|
||||
// mtime comes from fstat/stat on the already-open fd — no extra open().
|
||||
//
|
||||
@@ -39,6 +40,7 @@
|
||||
|
||||
import { createReadStream, fstat } from 'fs'
|
||||
import { stat as fsStat, readFile } from 'fs/promises'
|
||||
import { detectEncoding, decodeBuffer } from './encoding.js'
|
||||
import { formatFileSize } from './format.js'
|
||||
|
||||
const FAST_PATH_MAX_SIZE = 10 * 1024 * 1024 // 10 MB
|
||||
@@ -115,7 +117,9 @@ export async function readFileInRange(
|
||||
)
|
||||
}
|
||||
|
||||
const text = await readFile(filePath, { encoding: 'utf8', signal })
|
||||
const rawBuffer = await readFile(filePath, { signal })
|
||||
const encoding = detectEncoding(rawBuffer)
|
||||
const text = decodeBuffer(rawBuffer, encoding)
|
||||
return readFileInRangeFast(
|
||||
text,
|
||||
stats.mtimeMs,
|
||||
@@ -227,6 +231,12 @@ type StreamState = {
|
||||
isFirstChunk: boolean
|
||||
resolveMtime: (ms: number) => void
|
||||
mtimeReady: Promise<number>
|
||||
/** Encoding detection state: null = not yet detected, string = detected */
|
||||
encoding: string | null
|
||||
/** TextDecoder instance: created after detection, used for streaming decode */
|
||||
decoder: TextDecoder | null
|
||||
/** Detection phase buffer: collects raw bytes until 4KB or stream end */
|
||||
detectionBuffer: number[]
|
||||
}
|
||||
|
||||
function streamOnOpen(this: StreamState, fd: number): void {
|
||||
@@ -235,15 +245,71 @@ function streamOnOpen(this: StreamState, fd: number): void {
|
||||
})
|
||||
}
|
||||
|
||||
function streamOnData(this: StreamState, chunk: string): void {
|
||||
if (this.isFirstChunk) {
|
||||
this.isFirstChunk = false
|
||||
if (chunk.charCodeAt(0) === 0xfeff) {
|
||||
chunk = chunk.slice(1)
|
||||
/**
 * Scans one chunk of already-decoded text for '\n'-terminated lines and
 * collects those whose index falls in [state.offset, state.endLine) into
 * state.selectedLines.
 *
 * Mutates `state` in place:
 * - clears `isFirstChunk` after the first call,
 * - increments `currentLineIndex` once per newline found,
 * - stores unfinished trailing text in `partial` (only while inside the
 *   selected range) so the next call can prepend it,
 * - when `truncateOnByteLimit` is set and `maxBytes` is defined, tracks
 *   `selectedBytes` and, once the budget would be exceeded, sets
 *   `truncatedByBytes` and lowers `endLine` to the current line index so no
 *   further lines are selected.
 *
 * @param state - Stream state read and updated by this call.
 * @param text - Decoded chunk text; a leading U+FEFF is dropped on the first chunk.
 */
function processTextChunk(state: StreamState, text: string): void {
|
||||
// BOM stripping (first chunk only)
|
||||
if (state.isFirstChunk) {
|
||||
state.isFirstChunk = false
|
||||
// 0xfeff is the code unit of a decoded byte-order mark.
|
||||
if (text.charCodeAt(0) === 0xfeff) {
|
||||
text = text.slice(1)
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): the next line uses `this` and `chunk`, but this function only
// declares `state` and `text`. It looks like a removed line from the previous
// streamOnData that the diff rendering left in place — confirm against the
// actual post-commit file before relying on it.
this.totalBytesRead += Buffer.byteLength(chunk)
|
||||
// Prepend whatever unfinished line the previous call left behind.
const data = state.partial.length > 0 ? state.partial + text : text
|
||||
state.partial = ''
|
||||
|
||||
let startPos = 0
|
||||
let newlinePos: number
|
||||
// Walk every '\n' in the combined text; each one ends a line.
while ((newlinePos = data.indexOf('\n', startPos)) !== -1) {
|
||||
if (
|
||||
state.currentLineIndex >= state.offset &&
|
||||
state.currentLineIndex < state.endLine
|
||||
) {
|
||||
let line = data.slice(startPos, newlinePos)
|
||||
// Drop the '\r' of a CRLF line ending.
|
||||
if (line.endsWith('\r')) {
|
||||
line = line.slice(0, -1)
|
||||
}
|
||||
if (state.truncateOnByteLimit && state.maxBytes !== undefined) {
|
||||
// One extra byte is budgeted whenever a line has already been selected
// (presumably the '\n' that will join the lines — confirm with the consumer).
const sep = state.selectedLines.length > 0 ? 1 : 0
|
||||
// Buffer.byteLength without an encoding argument measures the string as
// UTF-8, i.e. the size of the decoded text, not the bytes read from disk.
const nextBytes = state.selectedBytes + sep + Buffer.byteLength(line)
|
||||
if (nextBytes > state.maxBytes) {
|
||||
// Budget exceeded: flag truncation and shrink the range so later lines
// fail the range check above; the loop keeps running to count lines.
state.truncatedByBytes = true
|
||||
state.endLine = state.currentLineIndex
|
||||
} else {
|
||||
state.selectedBytes = nextBytes
|
||||
state.selectedLines.push(line)
|
||||
}
|
||||
} else {
|
||||
state.selectedLines.push(line)
|
||||
}
|
||||
}
|
||||
state.currentLineIndex++
|
||||
startPos = newlinePos + 1
|
||||
}
|
||||
|
||||
// Text after the last newline is an unfinished line. It is kept only when the
// current line is inside the selected range; otherwise it is dropped and
// `partial` stays empty.
if (startPos < data.length) {
|
||||
if (
|
||||
state.currentLineIndex >= state.offset &&
|
||||
state.currentLineIndex < state.endLine
|
||||
) {
|
||||
const fragment = data.slice(startPos)
|
||||
if (state.truncateOnByteLimit && state.maxBytes !== undefined) {
|
||||
const sep = state.selectedLines.length > 0 ? 1 : 0
|
||||
const fragBytes =
|
||||
state.selectedBytes + sep + Buffer.byteLength(fragment)
|
||||
// If the unfinished line alone already overflows the budget, the finished
// line cannot fit either: flag truncation, collapse the range, and return
// without storing the fragment so `partial` does not keep growing.
if (fragBytes > state.maxBytes) {
|
||||
state.truncatedByBytes = true
|
||||
state.endLine = state.currentLineIndex
|
||||
return
|
||||
}
|
||||
}
|
||||
state.partial = fragment
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function streamOnData(this: StreamState, chunk: Buffer): void {
|
||||
this.totalBytesRead += chunk.length
|
||||
|
||||
if (
|
||||
!this.truncateOnByteLimit &&
|
||||
this.maxBytes !== undefined &&
|
||||
@@ -255,69 +321,47 @@ function streamOnData(this: StreamState, chunk: string): void {
|
||||
return
|
||||
}
|
||||
|
||||
const data = this.partial.length > 0 ? this.partial + chunk : chunk
|
||||
this.partial = ''
|
||||
|
||||
let startPos = 0
|
||||
let newlinePos: number
|
||||
while ((newlinePos = data.indexOf('\n', startPos)) !== -1) {
|
||||
if (
|
||||
this.currentLineIndex >= this.offset &&
|
||||
this.currentLineIndex < this.endLine
|
||||
) {
|
||||
let line = data.slice(startPos, newlinePos)
|
||||
if (line.endsWith('\r')) {
|
||||
line = line.slice(0, -1)
|
||||
}
|
||||
if (this.truncateOnByteLimit && this.maxBytes !== undefined) {
|
||||
const sep = this.selectedLines.length > 0 ? 1 : 0
|
||||
const nextBytes = this.selectedBytes + sep + Buffer.byteLength(line)
|
||||
if (nextBytes > this.maxBytes) {
|
||||
// Cap hit — collapse the selection range so nothing more is
|
||||
// accumulated. Stream continues (to count totalLines).
|
||||
this.truncatedByBytes = true
|
||||
this.endLine = this.currentLineIndex
|
||||
} else {
|
||||
this.selectedBytes = nextBytes
|
||||
this.selectedLines.push(line)
|
||||
}
|
||||
} else {
|
||||
this.selectedLines.push(line)
|
||||
}
|
||||
// Phase 1: Encoding detection
|
||||
if (this.encoding === null) {
|
||||
for (let i = 0; i < chunk.length; i++) {
|
||||
this.detectionBuffer.push(chunk[i])
|
||||
}
|
||||
this.currentLineIndex++
|
||||
startPos = newlinePos + 1
|
||||
|
||||
// Collected at least 4KB, perform encoding detection
|
||||
if (this.detectionBuffer.length >= 4096) {
|
||||
this.encoding = detectEncoding(Buffer.from(this.detectionBuffer))
|
||||
this.decoder = new TextDecoder(this.encoding, {
|
||||
stream: true,
|
||||
} as TextDecoderOptions)
|
||||
|
||||
// Decode the detection buffer and feed to line scanning
|
||||
const decoded = this.decoder.decode(Buffer.from(this.detectionBuffer))
|
||||
this.detectionBuffer = []
|
||||
processTextChunk(this, decoded)
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Only keep the trailing fragment when inside the selected range.
|
||||
// Outside the range we just count newlines — discarding prevents
|
||||
// unbounded memory growth on huge single-line files.
|
||||
if (startPos < data.length) {
|
||||
if (
|
||||
this.currentLineIndex >= this.offset &&
|
||||
this.currentLineIndex < this.endLine
|
||||
) {
|
||||
const fragment = data.slice(startPos)
|
||||
// In truncate mode, `partial` can grow unboundedly if the selected
|
||||
// range contains a huge single line (no newline across many chunks).
|
||||
// Once the fragment alone would overflow the remaining budget, we know
|
||||
// the completed line can never fit — set truncated, collapse the
|
||||
// selection range, and discard the fragment to stop accumulation.
|
||||
if (this.truncateOnByteLimit && this.maxBytes !== undefined) {
|
||||
const sep = this.selectedLines.length > 0 ? 1 : 0
|
||||
const fragBytes = this.selectedBytes + sep + Buffer.byteLength(fragment)
|
||||
if (fragBytes > this.maxBytes) {
|
||||
this.truncatedByBytes = true
|
||||
this.endLine = this.currentLineIndex
|
||||
return
|
||||
}
|
||||
}
|
||||
this.partial = fragment
|
||||
}
|
||||
}
|
||||
// Phase 2: Decoding
|
||||
const decoded = this.decoder!.decode(chunk, {
|
||||
stream: true,
|
||||
} as unknown as TextDecodeOptions)
|
||||
processTextChunk(this, decoded)
|
||||
}
|
||||
|
||||
function streamOnEnd(this: StreamState): void {
|
||||
// If stream ended before detection completed (< 4KB file), detect now
|
||||
if (this.encoding === null) {
|
||||
this.encoding = detectEncoding(Buffer.from(this.detectionBuffer))
|
||||
this.decoder = new TextDecoder(this.encoding, {
|
||||
stream: true,
|
||||
} as TextDecoderOptions)
|
||||
const decoded = this.decoder.decode(Buffer.from(this.detectionBuffer))
|
||||
this.detectionBuffer = []
|
||||
processTextChunk(this, decoded)
|
||||
}
|
||||
|
||||
// Handle final fragment
|
||||
let line = this.partial
|
||||
if (line.endsWith('\r')) {
|
||||
line = line.slice(0, -1)
|
||||
@@ -366,7 +410,6 @@ function readFileInRangeStreaming(
|
||||
return new Promise((resolve, reject) => {
|
||||
const state: StreamState = {
|
||||
stream: createReadStream(filePath, {
|
||||
encoding: 'utf8',
|
||||
highWaterMark: 512 * 1024,
|
||||
...(signal ? { signal } : undefined),
|
||||
}),
|
||||
@@ -384,6 +427,9 @@ function readFileInRangeStreaming(
|
||||
isFirstChunk: true,
|
||||
resolveMtime: () => {},
|
||||
mtimeReady: null as unknown as Promise<number>,
|
||||
encoding: null,
|
||||
decoder: null,
|
||||
detectionBuffer: [],
|
||||
}
|
||||
state.mtimeReady = new Promise<number>(r => {
|
||||
state.resolveMtime = r
|
||||
|
||||
Reference in New Issue
Block a user