feat: add replacement support in single-byte encoders

ChALkeR · ChALkeR · commit 312746b52ade · 2026-01-23T02:30:52.000+04:00
diff --git a/README.md b/README.md
@@ -226,6 +226,8 @@ Returns a function `encode(string)` that encodes a string to bytes.
 In `'fatal'` mode (default), will throw on non well-formed strings or any codepoints which could
 not be encoded in the target encoding.
 
+In `'replacement'` mode, every unmapped code point and every unpaired surrogate is replaced with `U+003F` (`?`, encoded as byte `0x3F`).
+
 #### `latin1toString(arr)`
 
 Decode `iso-8859-1` bytes to a string.
diff --git a/single-byte.js b/single-byte.js
@@ -61,38 +61,59 @@ export function createSinglebyteDecoder(encoding, loose = false) {
 
 const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
 
-function encode(s, m) {
+function encode(s, m, loose) {
   const len = s.length
   const x = new Uint8Array(len)
   let i = nativeEncoder ? 0 : encodeAsciiPrefix(x, s)
 
-  for (const len3 = len - 3; i < len3; i += 4) {
+  if (!m || m.length < 256) return null // perf
+  const len3 = len - 3
+  while (i < len3) {
     const x0 = s.charCodeAt(i), x1 = s.charCodeAt(i + 1), x2 = s.charCodeAt(i + 2), x3 = s.charCodeAt(i + 3) // prettier-ignore
     const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
-    if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) return null
+    if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) break
 
     x[i] = c0
     x[i + 1] = c1
     x[i + 2] = c2
     x[i + 3] = c3
+    i += 4
   }
 
   for (; i < len; i++) {
     const x0 = s.charCodeAt(i)
     const c0 = m[x0]
-    if (!c0 && x0) return null
+    if (!c0 && x0) break
     x[i] = c0
   }
 
-  return x
+  if (i === len) return x
+  if (!loose) return null
+  let j = i
+  while (i < len) {
+    const x0 = s.charCodeAt(i++)
+    if (x0 >= 0xd8_00 && x0 < 0xdc_00) {
+      if (i < len) {
+        const x1 = s.charCodeAt(i)
+        if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
+      }
+      x[j++] = 63 // '?'
+    } else {
+      const c0 = m[x0]
+      x[j++] = !c0 && x0 ? 63 : c0
+    }
+
+  }
+
+  return j === len ? x : x.subarray(0, j)
 }
 
 // fromBase64+btoa path is faster on everything where fromBase64 is fast
 const useLatin1btoa = Uint8Array.fromBase64 && btoa && !skipWeb
 
 export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
-  // TODO: replacement, truncate (replacement will need varying length)
-  if (mode !== 'fatal') throw new Error('Unsupported mode')
+  const loose = mode === 'replacement'
+  if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
   const m = encodeMap(encoding) // asserts
   const isLatin1 = encoding === 'iso-8859-1'
 
@@ -106,24 +127,21 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
       if (useLatin1btoa && s.length >= 1024 && s.length < 1e8) {
         try {
           return Uint8Array.fromBase64(btoa(s)) // fails on non-latin1
-        } catch {
-          throw new TypeError(E_STRICT)
-        }
+        } catch {}
+      } else if (!NON_LATIN.test(s)) {
+        return encodeLatin1(s)
       }
 
-      if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
-      return encodeLatin1(s)
-    }
-
-    // Instead of an ASCII regex check, encode optimistically - this is faster
-    // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
-    if (nativeEncoder && !NON_LATIN.test(s)) {
+      if (!loose) throw new TypeError(E_STRICT)
+    } else if (nativeEncoder && !NON_LATIN.test(s)) {
+      // Instead of an ASCII regex check, encode optimistically - this is faster
+      // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
       try {
         return encodeAscii(s, E_STRICT)
       } catch {}
     }
 
-    const res = encode(s, m)
+    const res = encode(s, m, loose)
     if (!res) throw new TypeError(E_STRICT)
     return res
   }
diff --git a/single-byte.node.js b/single-byte.node.js
@@ -61,54 +61,79 @@ export function createSinglebyteDecoder(encoding, loose = false) {
 
 const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
 
-function encode(s, m) {
+function encode(s, m, loose) {
   const len = s.length
   let i = 0
   const b = Buffer.from(s, 'utf-16le') // aligned
   if (!isLE) b.swap16()
   const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2)
-  for (const len3 = len - 3; i < len3; i += 4) {
+  if (!m || m.length < 256) return null // perf
+  const len3 = len - 3
+  while (i < len3) {
     const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
     const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
-    if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) return null // prettier-ignore
+    if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) break
     x[i] = c0
     x[i + 1] = c1
     x[i + 2] = c2
     x[i + 3] = c3
+    i += 4
   }
 
+  const mlen = m.length
   for (; i < len; i++) {
     const x0 = x[i]
+    if (x0 >= mlen) break
     const c0 = m[x0]
-    if (!c0 && x0) return null
+    if (!c0 && x0) break
     x[i] = c0
   }
 
-  return new Uint8Array(x)
+  if (i === len) return new Uint8Array(x)
+  if (!loose) return null
+  let j = i
+  while (i < len) {
+    const x0 = x[i++]
+    if (x0 >= 0xd8_00 && x0 < 0xdc_00) {
+      if (i < len) {
+        const x1 = x[i]
+        if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
+      }
+      x[j++] = 63 // '?'
+    } else if (x0 >= mlen) {
+      x[j++] = 63 // '?'
+    } else {
+      const c0 = m[x0]
+      x[j++] = !c0 && x0 ? 63 : c0
+    }
+  }
+
+  return new Uint8Array(j === len ? x : x.subarray(0, j))
 }
 
 export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
-  // TODO: replacement, truncate (replacement will need varying length)
-  if (mode !== 'fatal') throw new Error('Unsupported mode')
+  const loose = mode === 'replacement'
+  if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
   const m = encodeMap(encoding) // asserts
   const isLatin1 = encoding === 'iso-8859-1'
 
   return (s) => {
     if (typeof s !== 'string') throw new TypeError(E_STRING)
     if (isLatin1) {
-      if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
-      const b = Buffer.from(s, 'latin1')
-      return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
-    }
-
-    // Instead of an ASCII regex check, encode optimistically - this is faster
-    // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
-    if (!NON_LATIN.test(s)) {
+      if (!NON_LATIN.test(s)) {
+        const b = Buffer.from(s, 'latin1')
+        return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
+      }
+
+      if (!loose) throw new TypeError(E_STRICT)
+    } else if (!NON_LATIN.test(s)) {
+      // Instead of an ASCII regex check, encode optimistically - this is faster
+      // Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
       const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
       if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
     }
 
-    const res = encode(s, m)
+    const res = encode(s, m, loose)
     if (!res) throw new TypeError(E_STRICT)
     return res
   }
diff --git a/tests/single-byte.test.js b/tests/single-byte.test.js
@@ -15,7 +15,9 @@ describe('single-byte encodings are supersets of ascii', () => {
   for (const encoding of encodings) {
     test(encoding, (t) => {
       const decoder = createSinglebyteDecoder(encoding)
+      const decoderLoose = createSinglebyteDecoder(encoding, true)
       const encoder = createSinglebyteEncoder(encoding)
+      const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
       for (let i = 0; i < 128; i++) {
         let str
         try {
@@ -27,7 +29,9 @@ describe('single-byte encodings are supersets of ascii', () => {
         t.assert.strictEqual(str.length, 1, i)
         t.assert.strictEqual(str.codePointAt(0), i, i)
 
+        t.assert.strictEqual(decoderLoose(Uint8Array.of(i)), str, i)
         t.assert.deepStrictEqual(encoder(str), Uint8Array.of(i))
+        t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(i))
       }
     })
   }
@@ -84,6 +88,7 @@ describe('single-byte encodings index: Unicode', () => {
       const decoder = createSinglebyteDecoder(encoding)
       const decoderLoose = createSinglebyteDecoder(encoding, true)
       const encoder = createSinglebyteEncoder(encoding)
+      const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
       const text = readFileSync(
         join(import.meta.dirname, 'encoding/fixtures/unicode/', fileName),
         'utf8'
@@ -145,6 +150,7 @@ describe('single-byte encodings index: Unicode', () => {
           t.assert.strictEqual(str, decoderLoose(Uint8Array.of(byte)))
 
           t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte))
+          t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte))
         }
       }
     })
@@ -158,6 +164,7 @@ describe('single-byte encodings index: WHATWG', () => {
       const decoder = createSinglebyteDecoder(encoding)
       const decoderLoose = createSinglebyteDecoder(encoding, true)
       const encoder = createSinglebyteEncoder(encoding)
+      const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
       const text = readFileSync(
         join(import.meta.dirname, 'encoding/fixtures/single-byte', `index-${encoding}.txt`),
         'utf8'
@@ -199,6 +206,7 @@ describe('single-byte encodings index: WHATWG', () => {
           t.assert.strictEqual(str, decoderLoose(Uint8Array.of(byte)))
 
           t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte))
+          t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte))
         } else {
           t.assert.throws(() => decoder(Uint8Array.of(byte)))
           try {
@@ -230,6 +238,7 @@ describe('single-byte encodings index: WHATWG non-normative indexes.json', () =>
       const decoder = createSinglebyteDecoder(encoding)
       const decoderLoose = createSinglebyteDecoder(encoding, true)
       const encoder = createSinglebyteEncoder(encoding)
+      const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
 
       t.assert.strictEqual(data.length, 128)
       for (let i = 0; i < data.length; i++) {
@@ -244,6 +253,7 @@ describe('single-byte encodings index: WHATWG non-normative indexes.json', () =>
           t.assert.strictEqual(decoder(Uint8Array.of(byte)), str)
           t.assert.strictEqual(decoderLoose(Uint8Array.of(byte)), str)
           t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte))
+          t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte))
         } else {
           t.assert.throws(() => decoder(Uint8Array.of(byte)))
           t.assert.strictEqual(decoderLoose(Uint8Array.of(byte)), '\uFFFD')
@@ -268,13 +278,16 @@ describe('x-user-defined', () => {
 
   test('encode', (t) => {
     const encoder = createSinglebyteEncoder(encoding)
+    const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
     for (let byte = 0; byte < 256; byte++) {
       const str = String.fromCodePoint(byte >= 0x80 ? 0xf7_80 + byte - 0x80 : byte)
       t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte), byte)
+      t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte), byte)
     }
 
     for (let i = 128; i < 512; i++) {
       t.assert.throws(() => encoder(String.fromCodePoint(i)), /Input is not well-formed/)
+      t.assert.deepStrictEqual(encoderLoose(String.fromCodePoint(i)), Uint8Array.of(0x3f), i)
     }
   })
 })
@@ -284,21 +297,31 @@ describe('codes above 0x7F are non-ASCII', () => {
   for (const encoding of ['iso-8859-2', 'iso-8859-16']) {
     test(encoding, (t) => {
       const encoder = createSinglebyteEncoder(encoding)
+      const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
       t.assert.deepStrictEqual(encoder('\x80'), new Uint8Array(1).fill(0x80))
       t.assert.deepStrictEqual(encoder('\x80'.repeat(4)), new Uint8Array(4).fill(0x80))
       t.assert.deepStrictEqual(encoder('\x80'.repeat(8)), new Uint8Array(8).fill(0x80))
       t.assert.deepStrictEqual(encoder('\x80'.repeat(16)), new Uint8Array(16).fill(0x80))
+      t.assert.deepStrictEqual(encoderLoose('\x80'), new Uint8Array(1).fill(0x80))
+      t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(4)), new Uint8Array(4).fill(0x80))
+      t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(8)), new Uint8Array(8).fill(0x80))
+      t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(16)), new Uint8Array(16).fill(0x80))
     })
   }
 
   // 0x80 maps to something else
   for (const encoding of ['windows-1250', 'windows-1252', 'x-user-defined']) {
     test(encoding, (t) => {
       const encoder = createSinglebyteEncoder(encoding)
+      const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
       t.assert.throws(() => encoder('\x80'))
       t.assert.throws(() => encoder('\x80'.repeat(4)))
       t.assert.throws(() => encoder('\x80'.repeat(8)))
       t.assert.throws(() => encoder('\x80'.repeat(16)))
+      t.assert.deepStrictEqual(encoderLoose('\x80'), new Uint8Array(1).fill(0x3f))
+      t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(4)), new Uint8Array(4).fill(0x3f))
+      t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(8)), new Uint8Array(8).fill(0x3f))
+      t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(16)), new Uint8Array(16).fill(0x3f))
     })
   }
 })