From 17c06690d89736a0c33dedb6a837359f5becbadc Mon Sep 17 00:00:00 2001 From: claude-code-best Date: Sun, 10 May 2026 22:08:52 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=9D=9E=20UTF-8=20?= =?UTF-8?q?=E7=BC=96=E7=A0=81=E6=96=87=E4=BB=B6=E8=AF=BB=E5=86=99=20round-?= =?UTF-8?q?trip=20=E5=AD=97=E8=8A=82=E6=8D=9F=E5=9D=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit GBK 文件编辑后被错误写入为 UTF-8(Buffer.from 不支持 gbk 编码, encodeString 静默 fallback),latin1/ANSI 文件 0x80-0x9F 范围字节因 TextDecoder('latin1') 与 Buffer.from('latin1') 编解码不对称而被篡改。 修复:latin1 解码改用严格 ISO-8859-1 映射保证与 Buffer.from 对称; GBK 编码通过 TextDecoder 反向构建查找表实现零依赖编码器。 Co-Authored-By: glm-5-turbo --- src/utils/__tests__/encoding.test.ts | 82 +++++++++++++++++++-- src/utils/encoding.ts | 102 ++++++++++++++++++++++++++- 2 files changed, 176 insertions(+), 8 deletions(-) diff --git a/src/utils/__tests__/encoding.test.ts b/src/utils/__tests__/encoding.test.ts index 69b6f4d26..19e274ece 100644 --- a/src/utils/__tests__/encoding.test.ts +++ b/src/utils/__tests__/encoding.test.ts @@ -70,6 +70,17 @@ describe('decodeBuffer', () => { const buf = Buffer.alloc(0) expect(decodeBuffer(buf, 'utf-8')).toBe('') }) + + test('decodes latin1 using strict ISO-8859-1 mapping', () => { + // 0x80 should decode to U+0080 (control char), NOT € (U+20AC) + const buf = Buffer.from([0x80, 0x85, 0x9c, 0xa0, 0xff]) + const decoded = decodeBuffer(buf, 'latin1') + expect(decoded.charCodeAt(0)).toBe(0x80) + expect(decoded.charCodeAt(1)).toBe(0x85) + expect(decoded.charCodeAt(2)).toBe(0x9c) + expect(decoded.charCodeAt(3)).toBe(0xa0) + expect(decoded.charCodeAt(4)).toBe(0xff) + }) }) describe('encodeString', () => { @@ -91,12 +102,71 @@ describe('encodeString', () => { expect(decodeBuffer(buffer, 'utf-16le')).toBe('Hello') }) - test('handles GBK encoding (may convert)', () => { + test('encodes GBK string correctly', () => { const { buffer, converted } = encodeString('你好', 'gbk') - 
expect(buffer).toBeInstanceOf(Buffer) - expect(typeof converted).toBe('boolean') - if (!converted) { - expect(decodeBuffer(buffer, 'gbk')).toBe('你好') - } + expect(converted).toBe(false) + expect(buffer.toString('hex')).toBe('c4e3bac3') + }) + + test('GBK round-trip preserves bytes', () => { + // "测试文件" in GBK + const original = Buffer.from([ + 0xb2, 0xe2, 0xca, 0xd4, 0xce, 0xc4, 0xbc, 0xfe, + ]) + const decoded = decodeBuffer(original, 'gbk') + const { buffer } = encodeString(decoded, 'gbk') + expect(buffer.equals(original)).toBe(true) + }) + + test('GBK encoding handles mixed ASCII and CJK', () => { + // "Hello你好" in GBK: 48 65 6c 6c 6f c4 e3 ba c3 + const { buffer, converted } = encodeString('Hello你好', 'gbk') + expect(converted).toBe(false) + expect(buffer.toString('hex')).toBe('48656c6c6fc4e3bac3') + }) + + test('latin1 round-trip preserves all byte values', () => { + // Test the full 0x80-0xFF range that previously broke + const bytes = Buffer.from([ + 0x80, 0x81, 0x85, 0x8c, 0x9c, 0xa0, 0xc0, 0xe9, 0xf6, 0xfc, 0xff, + ]) + const decoded = decodeBuffer(bytes, 'latin1') + const { buffer } = encodeString(decoded, 'latin1') + expect(buffer.equals(bytes)).toBe(true) + }) + + test('latin1 encoding does not set converted flag', () => { + const { buffer, converted } = encodeString('test\x80\x90', 'latin1') + expect(converted).toBe(false) + expect(buffer.toString('hex')).toBe('746573748090') + }) +}) + +describe('round-trip consistency', () => { + test('GBK file survives full read-decode-encode cycle', () => { + const original = Buffer.from([0xc4, 0xe3, 0xba, 0xc3, 0x0d, 0x0a]) + const enc = detectEncoding(original) + expect(enc).toBe('gbk') + const decoded = decodeBuffer(original, enc) + const { buffer } = encodeString(decoded, enc) + expect(buffer.equals(original)).toBe(true) + }) + + test('latin1 file survives full read-decode-encode cycle', () => { + const original = Buffer.from([0x80, 0x90, 0xa0, 0xff, 0x41, 0x42]) + const enc = detectEncoding(original) + 
expect(enc).toBe('latin1') + const decoded = decodeBuffer(original, enc) + const { buffer } = encodeString(decoded, enc) + expect(buffer.equals(original)).toBe(true) + }) + + test('UTF-8 file survives full read-decode-encode cycle', () => { + const original = Buffer.from('Hello 世界', 'utf-8') + const enc = detectEncoding(original) + expect(enc).toBe('utf-8') + const decoded = decodeBuffer(original, enc) + const { buffer } = encodeString(decoded, enc) + expect(buffer.equals(original)).toBe(true) }) }) diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts index 3a4b15216..88557b10a 100644 --- a/src/utils/encoding.ts +++ b/src/utils/encoding.ts @@ -12,6 +12,88 @@ export type FileEncoding = BufferEncoding | 'gbk' /** Encoding name accepted by TextDecoder (string), broader than FileEncoding */ export type DetectedEncoding = string +// --------------------------------------------------------------------------- +// GBK encode table — built once at module load via TextDecoder reverse lookup. +// Maps Unicode codepoint → [leadByte, trailByte] for every valid 2-byte GBK +// sequence. Single-byte ASCII (0x00-0x7F) needs no entry; those pass through +// Buffer.from directly. +// --------------------------------------------------------------------------- + +const gbkEncodeMap = new Map() +let gbkTableBuilt = false + +/** + * Build the GBK encode map by iterating every valid 2-byte GBK sequence + * (lead 0x81-0xFE, trail 0x40-0xFE excluding 0x7F) and recording the + * resulting Unicode codepoint from TextDecoder. 
/**
 * Lazily-built GBK encode table: UTF-16 code unit → packed two-byte GBK
 * sequence (`(lead << 8) | trail`). Stays `undefined` until the first
 * successful call to `ensureGbkTable`.
 */
let gbkPackedTable: Map<number, number> | undefined

/**
 * Build (once) and return the GBK encode table.
 *
 * Every candidate two-byte sequence (lead 0x81-0xFE, trail 0x40-0xFE except
 * 0x7F) is decoded with a fatal-mode `TextDecoder('gbk')`; each sequence that
 * decodes to exactly one non-ASCII UTF-16 code unit is recorded in reverse.
 *
 * When several byte pairs decode to the same character, the FIRST pair in
 * ascending byte order is kept. Overwriting with the last pair would change
 * bytes on a decode → encode round-trip (e.g. the WHATWG gb18030 index decodes
 * both 0xA1A1 and 0xA3A0 to U+3000, and 0xA1A1 is the canonical encoding).
 *
 * The table is cached only after the build succeeds, so a runtime without a
 * GBK decoder throws on every call instead of silently caching an empty table.
 *
 * NOTE(review): single-byte 0x80 and GB18030 four-byte sequences are not
 * probed here, so characters reachable only through them are unencodable —
 * confirm whether callers need them.
 *
 * @returns the shared table; callers must not mutate it.
 * @throws RangeError if the runtime's TextDecoder does not support 'gbk'.
 */
function ensureGbkTable(): Map<number, number> {
  if (gbkPackedTable !== undefined) return gbkPackedTable

  const table = new Map<number, number>()
  const decoder = new TextDecoder('gbk', { fatal: true })
  const probe = new Uint8Array(2)

  for (let lead = 0x81; lead <= 0xfe; lead++) {
    probe[0] = lead
    for (let trail = 0x40; trail <= 0xfe; trail++) {
      if (trail === 0x7f) continue
      probe[1] = trail

      let decoded: string
      try {
        decoded = decoder.decode(probe)
      } catch {
        // Unassigned / invalid GBK sequence — nothing to record.
        continue
      }

      // A usable entry is exactly one UTF-16 code unit outside ASCII.
      if (decoded.length !== 1) continue
      const unit = decoded.charCodeAt(0)
      if (unit > 0x7f && !table.has(unit)) {
        table.set(unit, (lead << 8) | trail)
      }
    }
  }

  gbkPackedTable = table
  return table
}

/**
 * Encode a string to GBK bytes.
 *
 * ASCII code units (U+0000-U+007F) are written as single bytes; other code
 * units are looked up in the table from `ensureGbkTable` and written as two
 * bytes. A code unit with no GBK mapping is written as '?' (0x3F) — this is
 * lossy, and each half of a surrogate pair produces its own '?'.
 *
 * @param str text to encode
 * @returns a Buffer holding the GBK bytes (empty for an empty string)
 * @throws RangeError if the runtime's TextDecoder does not support 'gbk'.
 */
function encodeGbk(str: string): Buffer {
  const table = ensureGbkTable()

  // Worst case is two bytes per UTF-16 code unit, so one allocation suffices.
  const out = Buffer.allocUnsafe(str.length * 2)
  let length = 0

  for (let i = 0; i < str.length; i++) {
    const unit = str.charCodeAt(i)

    if (unit <= 0x7f) {
      out[length++] = unit
      continue
    }

    const packed = table.get(unit)
    if (packed === undefined) {
      // Unencodable code unit — '?' fallback.
      out[length++] = 0x3f
      continue
    }

    out[length++] = packed >> 8
    out[length++] = packed & 0xff
  }

  // Only the first `length` bytes were written; expose just those.
  return out.subarray(0, length)
}
/**
 * Decode a buffer using the specified encoding.
 * Unified decoding entry point for all file read paths.
 *
 * For 'latin1' the bytes are decoded with strict ISO-8859-1 mapping
 * (byte 0xNN → U+00NN) via `Buffer.prototype.toString('latin1')` rather than
 * `TextDecoder('latin1')`: the WHATWG Encoding Standard treats the label
 * 'latin1' as windows-1252, which remaps bytes 0x80-0x9F and therefore does
 * not round-trip through `Buffer.from(str, 'latin1')`.
 *
 * Every other encoding is handed to `TextDecoder` unchanged.
 *
 * @param buffer raw bytes to decode
 * @param encoding encoding label; 'latin1' is special-cased as described
 * @returns the decoded string ('' for an empty buffer)
 * @throws RangeError if `encoding` is not a label TextDecoder supports
 */
export function decodeBuffer(
  buffer: Buffer,
  encoding: DetectedEncoding,
): string {
  if (encoding === 'latin1') {
    // Byte value = Unicode code point, symmetric with Buffer.from(str, 'latin1').
    return buffer.toString('latin1')
  }
  return new TextDecoder(encoding).decode(buffer)
}