mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-15 12:55:51 +00:00
Revert "fix: 修复非 UTF-8 编码文件读写 round-trip 字节损坏"
This reverts commit 17c06690d8.
This commit is contained in:
@@ -70,17 +70,6 @@ describe('decodeBuffer', () => {
|
|||||||
const buf = Buffer.alloc(0)
|
const buf = Buffer.alloc(0)
|
||||||
expect(decodeBuffer(buf, 'utf-8')).toBe('')
|
expect(decodeBuffer(buf, 'utf-8')).toBe('')
|
||||||
})
|
})
|
||||||
|
|
||||||
test('decodes latin1 using strict ISO-8859-1 mapping', () => {
|
|
||||||
// 0x80 should decode to U+0080 (control char), NOT € (U+20AC)
|
|
||||||
const buf = Buffer.from([0x80, 0x85, 0x9c, 0xa0, 0xff])
|
|
||||||
const decoded = decodeBuffer(buf, 'latin1')
|
|
||||||
expect(decoded.charCodeAt(0)).toBe(0x80)
|
|
||||||
expect(decoded.charCodeAt(1)).toBe(0x85)
|
|
||||||
expect(decoded.charCodeAt(2)).toBe(0x9c)
|
|
||||||
expect(decoded.charCodeAt(3)).toBe(0xa0)
|
|
||||||
expect(decoded.charCodeAt(4)).toBe(0xff)
|
|
||||||
})
|
|
||||||
})
|
})
|
||||||
|
|
||||||
describe('encodeString', () => {
|
describe('encodeString', () => {
|
||||||
@@ -102,71 +91,12 @@ describe('encodeString', () => {
|
|||||||
expect(decodeBuffer(buffer, 'utf-16le')).toBe('Hello')
|
expect(decodeBuffer(buffer, 'utf-16le')).toBe('Hello')
|
||||||
})
|
})
|
||||||
|
|
||||||
test('encodes GBK string correctly', () => {
|
test('handles GBK encoding (may convert)', () => {
|
||||||
const { buffer, converted } = encodeString('你好', 'gbk')
|
const { buffer, converted } = encodeString('你好', 'gbk')
|
||||||
expect(converted).toBe(false)
|
expect(buffer).toBeInstanceOf(Buffer)
|
||||||
expect(buffer.toString('hex')).toBe('c4e3bac3')
|
expect(typeof converted).toBe('boolean')
|
||||||
})
|
if (!converted) {
|
||||||
|
expect(decodeBuffer(buffer, 'gbk')).toBe('你好')
|
||||||
test('GBK round-trip preserves bytes', () => {
|
}
|
||||||
// "测试文件" in GBK
|
|
||||||
const original = Buffer.from([
|
|
||||||
0xb2, 0xe2, 0xca, 0xd4, 0xce, 0xc4, 0xbc, 0xfe,
|
|
||||||
])
|
|
||||||
const decoded = decodeBuffer(original, 'gbk')
|
|
||||||
const { buffer } = encodeString(decoded, 'gbk')
|
|
||||||
expect(buffer.equals(original)).toBe(true)
|
|
||||||
})
|
|
||||||
|
|
||||||
test('GBK encoding handles mixed ASCII and CJK', () => {
|
|
||||||
// "Hello你好" in GBK: 48 65 6c 6c 6f c4 e3 ba c3
|
|
||||||
const { buffer, converted } = encodeString('Hello你好', 'gbk')
|
|
||||||
expect(converted).toBe(false)
|
|
||||||
expect(buffer.toString('hex')).toBe('48656c6c6fc4e3bac3')
|
|
||||||
})
|
|
||||||
|
|
||||||
test('latin1 round-trip preserves all byte values', () => {
|
|
||||||
// Test the full 0x80-0xFF range that previously broke
|
|
||||||
const bytes = Buffer.from([
|
|
||||||
0x80, 0x81, 0x85, 0x8c, 0x9c, 0xa0, 0xc0, 0xe9, 0xf6, 0xfc, 0xff,
|
|
||||||
])
|
|
||||||
const decoded = decodeBuffer(bytes, 'latin1')
|
|
||||||
const { buffer } = encodeString(decoded, 'latin1')
|
|
||||||
expect(buffer.equals(bytes)).toBe(true)
|
|
||||||
})
|
|
||||||
|
|
||||||
test('latin1 encoding does not set converted flag', () => {
|
|
||||||
const { buffer, converted } = encodeString('test\x80\x90', 'latin1')
|
|
||||||
expect(converted).toBe(false)
|
|
||||||
expect(buffer.toString('hex')).toBe('746573748090')
|
|
||||||
})
|
|
||||||
})
|
|
||||||
|
|
||||||
describe('round-trip consistency', () => {
|
|
||||||
test('GBK file survives full read-decode-encode cycle', () => {
|
|
||||||
const original = Buffer.from([0xc4, 0xe3, 0xba, 0xc3, 0x0d, 0x0a])
|
|
||||||
const enc = detectEncoding(original)
|
|
||||||
expect(enc).toBe('gbk')
|
|
||||||
const decoded = decodeBuffer(original, enc)
|
|
||||||
const { buffer } = encodeString(decoded, enc)
|
|
||||||
expect(buffer.equals(original)).toBe(true)
|
|
||||||
})
|
|
||||||
|
|
||||||
test('latin1 file survives full read-decode-encode cycle', () => {
|
|
||||||
const original = Buffer.from([0x80, 0x90, 0xa0, 0xff, 0x41, 0x42])
|
|
||||||
const enc = detectEncoding(original)
|
|
||||||
expect(enc).toBe('latin1')
|
|
||||||
const decoded = decodeBuffer(original, enc)
|
|
||||||
const { buffer } = encodeString(decoded, enc)
|
|
||||||
expect(buffer.equals(original)).toBe(true)
|
|
||||||
})
|
|
||||||
|
|
||||||
test('UTF-8 file survives full read-decode-encode cycle', () => {
|
|
||||||
const original = Buffer.from('Hello 世界', 'utf-8')
|
|
||||||
const enc = detectEncoding(original)
|
|
||||||
expect(enc).toBe('utf-8')
|
|
||||||
const decoded = decodeBuffer(original, enc)
|
|
||||||
const { buffer } = encodeString(decoded, enc)
|
|
||||||
expect(buffer.equals(original)).toBe(true)
|
|
||||||
})
|
})
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -12,88 +12,6 @@ export type FileEncoding = BufferEncoding | 'gbk'
|
|||||||
/** Encoding name accepted by TextDecoder (string), broader than FileEncoding */
|
/** Encoding name accepted by TextDecoder (string), broader than FileEncoding */
|
||||||
export type DetectedEncoding = string
|
export type DetectedEncoding = string
|
||||||
|
|
||||||
// ---------------------------------------------------------------------------
// GBK encode table — built lazily (on the first ensureGbkTable() call) by
// decoding every candidate 2-byte GBK sequence with TextDecoder and recording
// the reverse mapping.
// Maps UTF-16 code unit → [leadByte, trailByte]. Single-byte ASCII
// (0x00-0x7F) needs no entry; those bytes are copied through unchanged by the
// encoder.
// ---------------------------------------------------------------------------

const gbkEncodeMap = new Map<number, [number, number]>()
let gbkTableBuilt = false

/**
 * Populate `gbkEncodeMap`. Safe to call repeatedly: once the table has been
 * built successfully, later calls return immediately.
 *
 * Every candidate 2-byte sequence (lead 0x81-0xFE, trail 0x40-0xFE excluding
 * 0x7F) is decoded with a fatal TextDecoder; the first UTF-16 code unit of
 * the result is recorded as the key for that byte pair. Sequences the decoder
 * rejects are skipped.
 *
 * Any error thrown while constructing the decoder (for example when the
 * runtime has no 'gbk' support) propagates to the caller, and the table stays
 * marked as unbuilt so the next call tries again.
 */
function ensureGbkTable(): void {
  if (gbkTableBuilt) return

  // This constructor may throw. The "built" flag is deliberately set only
  // after the loop finishes: marking the table as built up front would turn a
  // one-time failure into a permanently empty table, and every non-ASCII
  // character would then be silently encoded as '?'.
  const decoder = new TextDecoder('gbk', { fatal: true })
  const sequence = new Uint8Array(2)

  for (let lead = 0x81; lead <= 0xfe; lead++) {
    sequence[0] = lead

    for (let trail = 0x40; trail <= 0xfe; trail++) {
      if (trail === 0x7f) continue
      sequence[1] = trail

      let decoded: string
      try {
        decoded = decoder.decode(sequence)
      } catch {
        // The decoder rejected this byte pair — there is nothing to record.
        continue
      }

      // ASCII results are never stored; ASCII is encoded without the table.
      const codeUnit = decoded.charCodeAt(0)
      if (codeUnit > 0x7f) {
        gbkEncodeMap.set(codeUnit, [lead, trail])
      }
    }
  }

  gbkTableBuilt = true
}
|
|
||||||
|
|
||||||
/**
 * Encode a string as GBK bytes.
 *
 * - ASCII (U+0000-U+007F) is emitted as the identical single byte.
 * - Any other UTF-16 code unit is looked up in the GBK encode table and
 *   emitted as its two-byte sequence.
 * - A character with no table entry is emitted as a single '?' (0x3F). A
 *   surrogate pair counts as ONE character and yields one '?', not two.
 *
 * @param str Text to encode.
 * @returns A Buffer holding the GBK byte sequence (empty for an empty string).
 */
function encodeGbk(str: string): Buffer {
  ensureGbkTable()

  // Collect all output bytes in one list and convert once at the end, instead
  // of allocating a separate Buffer for every non-ASCII character.
  const bytes: number[] = []

  for (let i = 0; i < str.length; i++) {
    const unit = str.charCodeAt(i)

    if (unit <= 0x7f) {
      bytes.push(unit)
      continue
    }

    const pair = gbkEncodeMap.get(unit)
    if (pair) {
      bytes.push(pair[0], pair[1])
      continue
    }

    // Unencodable character — fall back to '?'.
    bytes.push(0x3f)

    // If this was the high half of a valid surrogate pair, consume the low
    // half as well so the pair produces exactly one replacement byte.
    if (unit >= 0xd800 && unit <= 0xdbff) {
      const next = str.charCodeAt(i + 1) // NaN past the end; fails the test below
      if (next >= 0xdc00 && next <= 0xdfff) i++
    }
  }

  return Buffer.from(bytes)
}
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Detect the encoding of a buffer using three-layer detection:
|
* Detect the encoding of a buffer using three-layer detection:
|
||||||
* 1. BOM (Byte Order Mark) detection
|
* 1. BOM (Byte Order Mark) detection
|
||||||
@@ -135,25 +53,11 @@ export function detectEncoding(buffer: Buffer): FileEncoding {
|
|||||||
/**
 * Decode a buffer using the specified encoding.
 * Unified decoding entry point for all file read paths.
 *
 * 'latin1' (and its Node.js alias 'binary') is decoded as strict ISO-8859-1,
 * where byte N becomes U+00NN. It is deliberately NOT passed to TextDecoder:
 * the Encoding Standard treats the label 'latin1' as windows-1252, so bytes
 * 0x80-0x9F would decode to characters such as U+20AC and would no longer
 * round-trip through Buffer.from(str, 'latin1').
 *
 * Every other encoding name is handed to TextDecoder.
 *
 * @param buffer Raw bytes to decode.
 * @param encoding Encoding name; anything other than 'latin1'/'binary' must
 *   be a label that TextDecoder accepts.
 * @returns The decoded text.
 * @throws RangeError (from TextDecoder) when the encoding label is not
 *   supported by the runtime.
 */
export function decodeBuffer(
  buffer: Buffer,
  encoding: DetectedEncoding,
): string {
  if (encoding === 'latin1' || encoding === 'binary') {
    // Buffer's own latin1 decoding maps each byte to the code point of the
    // same value, which is the exact inverse of Buffer.from(str, 'latin1').
    return buffer.toString('latin1')
  }
  return new TextDecoder(encoding).decode(buffer)
}
|
||||||
|
|
||||||
@@ -175,10 +79,8 @@ export function encodeString(
|
|||||||
if (encoding === 'utf-16le') {
|
if (encoding === 'utf-16le') {
|
||||||
return { buffer: Buffer.from(content, 'utf-16le'), converted: false }
|
return { buffer: Buffer.from(content, 'utf-16le'), converted: false }
|
||||||
}
|
}
|
||||||
if (encoding === 'gbk') {
|
|
||||||
return { buffer: encodeGbk(content), converted: false }
|
// Other encodings (e.g. gbk): try Buffer.from, fall back to UTF-8
|
||||||
}
|
|
||||||
// Buffer-supported encodings (latin1, ascii, binary, etc.)
|
|
||||||
try {
|
try {
|
||||||
const buf = Buffer.from(content, encoding as BufferEncoding)
|
const buf = Buffer.from(content, encoding as BufferEncoding)
|
||||||
return { buffer: buf, converted: false }
|
return { buffer: buf, converted: false }
|
||||||
|
|||||||
Reference in New Issue
Block a user