From 17c06690d89736a0c33dedb6a837359f5becbadc Mon Sep 17 00:00:00 2001
From: claude-code-best <claude-code-best@proton.me>
Date: Sun, 10 May 2026 22:08:52 +0800
Subject: [PATCH] =?UTF-8?q?fix:=20=E4=BF=AE=E5=A4=8D=E9=9D=9E=20UTF-8=20?=
 =?UTF-8?q?=E7=BC=96=E7=A0=81=E6=96=87=E4=BB=B6=E8=AF=BB=E5=86=99=20round-?=
 =?UTF-8?q?trip=20=E5=AD=97=E8=8A=82=E6=8D=9F=E5=9D=8F?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GBK 文件编辑后被错误写入为 UTF-8（Buffer.from 不支持 gbk 编码，
encodeString 静默 fallback），latin1/ANSI 文件 0x80-0x9F 范围字节因
TextDecoder('latin1') 与 Buffer.from('latin1') 编解码不对称而被篡改。

修复：latin1 解码改用严格 ISO-8859-1 映射保证与 Buffer.from 对称；
GBK 编码通过 TextDecoder 反向构建查找表实现零依赖编码器。

Co-Authored-By: glm-5-turbo <zai-org@claude-code-best.win>
---
 src/utils/__tests__/encoding.test.ts |  82 +++++++++++++++++++--
 src/utils/encoding.ts                | 102 ++++++++++++++++++++++++++-
 2 files changed, 176 insertions(+), 8 deletions(-)

diff --git a/src/utils/__tests__/encoding.test.ts b/src/utils/__tests__/encoding.test.ts
index 69b6f4d26..19e274ece 100644
--- a/src/utils/__tests__/encoding.test.ts
+++ b/src/utils/__tests__/encoding.test.ts
@@ -70,6 +70,17 @@ describe('decodeBuffer', () => {
     const buf = Buffer.alloc(0)
     expect(decodeBuffer(buf, 'utf-8')).toBe('')
   })
+
+  test('decodes latin1 using strict ISO-8859-1 mapping', () => {
+    // 0x80 should decode to U+0080 (control char), NOT € (U+20AC)
+    const buf = Buffer.from([0x80, 0x85, 0x9c, 0xa0, 0xff])
+    const decoded = decodeBuffer(buf, 'latin1')
+    expect(decoded.charCodeAt(0)).toBe(0x80)
+    expect(decoded.charCodeAt(1)).toBe(0x85)
+    expect(decoded.charCodeAt(2)).toBe(0x9c)
+    expect(decoded.charCodeAt(3)).toBe(0xa0)
+    expect(decoded.charCodeAt(4)).toBe(0xff)
+  })
 })
 
 describe('encodeString', () => {
@@ -91,12 +102,71 @@ describe('encodeString', () => {
     expect(decodeBuffer(buffer, 'utf-16le')).toBe('Hello')
   })
 
-  test('handles GBK encoding (may convert)', () => {
+  test('encodes GBK string correctly', () => {
     const { buffer, converted } = encodeString('你好', 'gbk')
-    expect(buffer).toBeInstanceOf(Buffer)
-    expect(typeof converted).toBe('boolean')
-    if (!converted) {
-      expect(decodeBuffer(buffer, 'gbk')).toBe('你好')
-    }
+    expect(converted).toBe(false)
+    expect(buffer.toString('hex')).toBe('c4e3bac3')
+  })
+
+  test('GBK round-trip preserves bytes', () => {
+    // "测试文件" in GBK
+    const original = Buffer.from([
+      0xb2, 0xe2, 0xca, 0xd4, 0xce, 0xc4, 0xbc, 0xfe,
+    ])
+    const decoded = decodeBuffer(original, 'gbk')
+    const { buffer } = encodeString(decoded, 'gbk')
+    expect(buffer.equals(original)).toBe(true)
+  })
+
+  test('GBK encoding handles mixed ASCII and CJK', () => {
+    // "Hello你好" in GBK: 48 65 6c 6c 6f c4 e3 ba c3
+    const { buffer, converted } = encodeString('Hello你好', 'gbk')
+    expect(converted).toBe(false)
+    expect(buffer.toString('hex')).toBe('48656c6c6fc4e3bac3')
+  })
+
+  test('latin1 round-trip preserves all byte values', () => {
+    // Test the full 0x80-0xFF range that previously broke
+    const bytes = Buffer.from([
+      0x80, 0x81, 0x85, 0x8c, 0x9c, 0xa0, 0xc0, 0xe9, 0xf6, 0xfc, 0xff,
+    ])
+    const decoded = decodeBuffer(bytes, 'latin1')
+    const { buffer } = encodeString(decoded, 'latin1')
+    expect(buffer.equals(bytes)).toBe(true)
+  })
+
+  test('latin1 encoding does not set converted flag', () => {
+    const { buffer, converted } = encodeString('test\x80\x90', 'latin1')
+    expect(converted).toBe(false)
+    expect(buffer.toString('hex')).toBe('746573748090')
+  })
+})
+
+describe('round-trip consistency', () => {
+  test('GBK file survives full read-decode-encode cycle', () => {
+    const source = Buffer.from([0xc4, 0xe3, 0xba, 0xc3, 0x0d, 0x0a])
+    const detected = detectEncoding(source)
+    expect(detected).toBe('gbk')
+    const text = decodeBuffer(source, detected)
+    const { buffer: rewritten } = encodeString(text, detected)
+    expect(rewritten.equals(source)).toBe(true)
+  })
+
+  test('latin1 file survives full read-decode-encode cycle', () => {
+    const source = Buffer.from([0x80, 0x90, 0xa0, 0xff, 0x41, 0x42])
+    const detected = detectEncoding(source)
+    expect(detected).toBe('latin1')
+    const text = decodeBuffer(source, detected)
+    const { buffer: rewritten } = encodeString(text, detected)
+    expect(rewritten.equals(source)).toBe(true)
+  })
+
+  test('UTF-8 file survives full read-decode-encode cycle', () => {
+    const source = Buffer.from('Hello 世界', 'utf-8')
+    const detected = detectEncoding(source)
+    expect(detected).toBe('utf-8')
+    const text = decodeBuffer(source, detected)
+    const { buffer: rewritten } = encodeString(text, detected)
+    expect(rewritten.equals(source)).toBe(true)
   })
 })
diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts
index 3a4b15216..88557b10a 100644
--- a/src/utils/encoding.ts
+++ b/src/utils/encoding.ts
@@ -12,6 +12,88 @@ export type FileEncoding = BufferEncoding | 'gbk'
 /** Encoding name accepted by TextDecoder (string), broader than FileEncoding */
 export type DetectedEncoding = string
 
+// ---------------------------------------------------------------------------
+// GBK encode table — filled lazily by ensureGbkTable() on first use (not at
+// module load) via TextDecoder reverse lookup. Maps a UTF-16 code unit to the
+// [leadByte, trailByte] of a valid 2-byte GBK sequence. ASCII (0x00-0x7F)
+// needs no entry; the encoder copies those bytes through unchanged.
+// ---------------------------------------------------------------------------
+
+const gbkEncodeMap = new Map<number, [number, number]>()
+let gbkTableBuilt = false
+
+/**
+ * Build the GBK encode map by decoding every candidate 2-byte GBK sequence
+ * (lead 0x81-0xFE, trail 0x40-0xFE excluding 0x7F) with TextDecoder.
+ * The built flag is set only on success, so a throwing build is retried.
+ */
+function ensureGbkTable(): void {
+  if (gbkTableBuilt) return
+  const decoder = new TextDecoder('gbk', { fatal: true })
+  const twoByteBuf = Buffer.alloc(2)
+
+  for (let lead = 0x81; lead <= 0xfe; lead++) {
+    for (let trail = 0x40; trail <= 0xfe; trail++) {
+      if (trail === 0x7f) continue
+      twoByteBuf[0] = lead
+      twoByteBuf[1] = trail
+      try {
+        const str = decoder.decode(twoByteBuf)
+        const cp = str.charCodeAt(0)
+        // First pair wins: keep canonical bytes for duplicated characters.
+        if (cp > 0x7f && !gbkEncodeMap.has(cp)) {
+          gbkEncodeMap.set(cp, [lead, trail])
+        }
+      } catch {
+        // Invalid GBK sequence — skip
+      }
+    }
+  }
+  gbkTableBuilt = true
+}
+
+/**
+ * Encode a string to GBK bytes. ASCII chars (U+0000-U+007F) are copied as-is;
+ * other chars are looked up in the table filled by ensureGbkTable().
+ * A character with no 2-byte GBK form becomes a single '?' — including an
+ * astral character, whose surrogate pair is consumed as one unit.
+ */
+function encodeGbk(str: string): Buffer {
+  ensureGbkTable()
+
+  // Worst case is 2 bytes per UTF-16 code unit, so allocate that once.
+  const scratch = Buffer.allocUnsafe(str.length * 2)
+  let written = 0
+
+  for (let i = 0; i < str.length; i++) {
+    const unit = str.charCodeAt(i)
+
+    if (unit <= 0x7f) {
+      scratch[written++] = unit
+      continue
+    }
+
+    // Non-ASCII: emit the mapped lead/trail bytes when the table has them.
+    const pair = gbkEncodeMap.get(unit)
+    if (pair) {
+      scratch[written++] = pair[0]
+      scratch[written++] = pair[1]
+      continue
+    }
+
+    // Unencodable char — use '?' as fallback. A high surrogate followed by a
+    // low surrogate is one character, so skip the low half as well.
+    scratch[written++] = 0x3f
+    const next = str.charCodeAt(i + 1)
+    const isPair =
+      unit >= 0xd800 && unit <= 0xdbff && next >= 0xdc00 && next <= 0xdfff
+    if (isPair) i++
+  }
+
+  // Copy out the used prefix so the result owns exactly its own bytes.
+  return Buffer.from(scratch.subarray(0, written))
+}
+
 /**
  * Detect the encoding of a buffer using three-layer detection:
  * 1. BOM (Byte Order Mark) detection
@@ -53,11 +135,25 @@ export function detectEncoding(buffer: Buffer): FileEncoding {
 /**
  * Decode a buffer using the specified encoding.
  * Unified decoding entry point for all file read paths.
+ *
+ * For 'latin1', uses strict ISO-8859-1 mapping (byte 0xNN → U+00NN) instead
+ * of TextDecoder('latin1'): the WHATWG Encoding Standard treats that label as
+ * windows-1252, which remaps 0x80-0x9F and breaks round-trip through
+ * Buffer.from(str, 'latin1').
  */
 export function decodeBuffer(
   buffer: Buffer,
   encoding: DetectedEncoding,
 ): string {
+  if (encoding === 'latin1') {
+    // Strict ISO-8859-1: byte value = Unicode codepoint.
+    // Buffer's own 'latin1' codec performs exactly this mapping, which makes
+    // it the precise inverse of Buffer.from(str, 'latin1') and guarantees
+    // round-trip fidelity.
+    // Decoding natively also avoids concatenating one character per byte,
+    // which is slow for large files.
+    return buffer.toString('latin1')
+  }
   return new TextDecoder(encoding).decode(buffer)
 }
 
@@ -79,8 +175,10 @@ export function encodeString(
   if (encoding === 'utf-16le') {
     return { buffer: Buffer.from(content, 'utf-16le'), converted: false }
   }
-
-  // Other encodings (e.g. gbk): try Buffer.from, fall back to UTF-8
+  if (encoding === 'gbk') {
+    return { buffer: encodeGbk(content), converted: false }
+  }
+  // Buffer-supported encodings (latin1, ascii, binary, etc.)
   try {
     const buf = Buffer.from(content, encoding as BufferEncoding)
     return { buffer: buf, converted: false }