Skip to content

Commit 312746b

Browse files
committed
feat: add replacement support in single-byte encoders
1 parent 44bb79c commit 312746b

File tree

4 files changed

+102
-34
lines changed

4 files changed

+102
-34
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -226,6 +226,8 @@ Returns a function `encode(string)` that encodes a string to bytes.
226226
In `'fatal'` mode (default), the encoder throws on strings that are not well-formed or on any code points which could
227227
not be encoded in the target encoding.
228228

229+
In `'replacement'` mode, each unmapped code point (a valid surrogate pair counts as one code point) and each unpaired surrogate is replaced with a single `0x3F` byte (`U+003F`, '?').
230+
229231
#### `latin1toString(arr)`
230232

231233
Decode `iso-8859-1` bytes to a string.

single-byte.js

Lines changed: 36 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -61,38 +61,59 @@ export function createSinglebyteDecoder(encoding, loose = false) {
6161

6262
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
6363

64-
function encode(s, m) {
64+
function encode(s, m, loose) {
6565
const len = s.length
6666
const x = new Uint8Array(len)
6767
let i = nativeEncoder ? 0 : encodeAsciiPrefix(x, s)
6868

69-
for (const len3 = len - 3; i < len3; i += 4) {
69+
if (!m || m.length < 256) return null // perf
70+
const len3 = len - 3
71+
while (i < len3) {
7072
const x0 = s.charCodeAt(i), x1 = s.charCodeAt(i + 1), x2 = s.charCodeAt(i + 2), x3 = s.charCodeAt(i + 3) // prettier-ignore
7173
const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
72-
if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) return null
74+
if ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3)) break
7375

7476
x[i] = c0
7577
x[i + 1] = c1
7678
x[i + 2] = c2
7779
x[i + 3] = c3
80+
i += 4
7881
}
7982

8083
for (; i < len; i++) {
8184
const x0 = s.charCodeAt(i)
8285
const c0 = m[x0]
83-
if (!c0 && x0) return null
86+
if (!c0 && x0) break
8487
x[i] = c0
8588
}
8689

87-
return x
90+
if (i === len) return x
91+
if (!loose) return null
92+
let j = i
93+
while (i < len) {
94+
const x0 = s.charCodeAt(i++)
95+
if (x0 >= 0xd8_00 && x0 < 0xdc_00) {
96+
if (i < len) {
97+
const x1 = s.charCodeAt(i)
98+
if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
99+
}
100+
x[j++] = 63 // '?'
101+
} else {
102+
const c0 = m[x0]
103+
x[j++] = !c0 && x0 ? 63 : c0
104+
}
105+
106+
}
107+
108+
return j === len ? x : x.subarray(0, j)
88109
}
89110

90111
// fromBase64+btoa path is faster on everything where fromBase64 is fast
91112
const useLatin1btoa = Uint8Array.fromBase64 && btoa && !skipWeb
92113

93114
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
94-
// TODO: replacement, truncate (replacement will need varying length)
95-
if (mode !== 'fatal') throw new Error('Unsupported mode')
115+
const loose = mode === 'replacement'
116+
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
96117
const m = encodeMap(encoding) // asserts
97118
const isLatin1 = encoding === 'iso-8859-1'
98119

@@ -106,24 +127,21 @@ export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
106127
if (useLatin1btoa && s.length >= 1024 && s.length < 1e8) {
107128
try {
108129
return Uint8Array.fromBase64(btoa(s)) // fails on non-latin1
109-
} catch {
110-
throw new TypeError(E_STRICT)
111-
}
130+
} catch {}
131+
} else if (!NON_LATIN.test(s)) {
132+
return encodeLatin1(s)
112133
}
113134

114-
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
115-
return encodeLatin1(s)
116-
}
117-
118-
// Instead of an ASCII regex check, encode optimistically - this is faster
119-
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
120-
if (nativeEncoder && !NON_LATIN.test(s)) {
135+
if (!loose) throw new TypeError(E_STRICT)
136+
} else if (nativeEncoder && !NON_LATIN.test(s)) {
137+
// Instead of an ASCII regex check, encode optimistically - this is faster
138+
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
121139
try {
122140
return encodeAscii(s, E_STRICT)
123141
} catch {}
124142
}
125143

126-
const res = encode(s, m)
144+
const res = encode(s, m, loose)
127145
if (!res) throw new TypeError(E_STRICT)
128146
return res
129147
}

single-byte.node.js

Lines changed: 41 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -61,54 +61,79 @@ export function createSinglebyteDecoder(encoding, loose = false) {
6161

6262
const NON_LATIN = /[^\x00-\xFF]/ // eslint-disable-line no-control-regex
6363

64-
function encode(s, m) {
64+
function encode(s, m, loose) {
6565
const len = s.length
6666
let i = 0
6767
const b = Buffer.from(s, 'utf-16le') // aligned
6868
if (!isLE) b.swap16()
6969
const x = new Uint16Array(b.buffer, b.byteOffset, b.byteLength / 2)
70-
for (const len3 = len - 3; i < len3; i += 4) {
70+
if (!m || m.length < 256) return null // perf
71+
const len3 = len - 3
72+
while (i < len3) {
7173
const x0 = x[i], x1 = x[i + 1], x2 = x[i + 2], x3 = x[i + 3] // prettier-ignore
7274
const c0 = m[x0], c1 = m[x1], c2 = m[x2], c3 = m[x3] // prettier-ignore
73-
if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) return null // prettier-ignore
75+
if (!(c0 && c1 && c2 && c3) && ((!c0 && x0) || (!c1 && x1) || (!c2 && x2) || (!c3 && x3))) break
7476
x[i] = c0
7577
x[i + 1] = c1
7678
x[i + 2] = c2
7779
x[i + 3] = c3
80+
i += 4
7881
}
7982

83+
const mlen = m.length
8084
for (; i < len; i++) {
8185
const x0 = x[i]
86+
if (x0 >= mlen) break
8287
const c0 = m[x0]
83-
if (!c0 && x0) return null
88+
if (!c0 && x0) break
8489
x[i] = c0
8590
}
8691

87-
return new Uint8Array(x)
92+
if (i === len) return new Uint8Array(x)
93+
if (!loose) return null
94+
let j = i
95+
while (i < len) {
96+
const x0 = x[i++]
97+
if (x0 >= 0xd8_00 && x0 < 0xdc_00) {
98+
if (i < len) {
99+
const x1 = x[i]
100+
if (x1 >= 0xdc_00 && x1 < 0xe0_00) i++
101+
}
102+
x[j++] = 63 // '?'
103+
} else if (x0 >= mlen) {
104+
x[j++] = 63 // '?'
105+
} else {
106+
const c0 = m[x0]
107+
x[j++] = !c0 && x0 ? 63 : c0
108+
}
109+
}
110+
111+
return new Uint8Array(j === len ? x : x.subarray(0, j))
88112
}
89113

90114
export function createSinglebyteEncoder(encoding, { mode = 'fatal' } = {}) {
91-
// TODO: replacement, truncate (replacement will need varying length)
92-
if (mode !== 'fatal') throw new Error('Unsupported mode')
115+
const loose = mode === 'replacement'
116+
if (mode !== 'fatal' && !loose) throw new Error('Unsupported mode')
93117
const m = encodeMap(encoding) // asserts
94118
const isLatin1 = encoding === 'iso-8859-1'
95119

96120
return (s) => {
97121
if (typeof s !== 'string') throw new TypeError(E_STRING)
98122
if (isLatin1) {
99-
if (NON_LATIN.test(s)) throw new TypeError(E_STRICT)
100-
const b = Buffer.from(s, 'latin1')
101-
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
102-
}
103-
104-
// Instead of an ASCII regex check, encode optimistically - this is faster
105-
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
106-
if (!NON_LATIN.test(s)) {
123+
if (!NON_LATIN.test(s)) {
124+
const b = Buffer.from(s, 'latin1')
125+
return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
126+
}
127+
128+
if (!loose) throw new TypeError(E_STRICT)
129+
} else if (!NON_LATIN.test(s)) {
130+
// Instead of an ASCII regex check, encode optimistically - this is faster
131+
// Check for 8-bit string with a regex though, this is instant on 8-bit strings so doesn't hurt the ASCII fast path
107132
const b = Buffer.from(s, 'utf8') // ascii/latin1 coerces, we need to check
108133
if (b.length === s.length) return new Uint8Array(b.buffer, b.byteOffset, b.byteLength)
109134
}
110135

111-
const res = encode(s, m)
136+
const res = encode(s, m, loose)
112137
if (!res) throw new TypeError(E_STRICT)
113138
return res
114139
}

tests/single-byte.test.js

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@ describe('single-byte encodings are supersets of ascii', () => {
1515
for (const encoding of encodings) {
1616
test(encoding, (t) => {
1717
const decoder = createSinglebyteDecoder(encoding)
18+
const decoderLoose = createSinglebyteDecoder(encoding, true)
1819
const encoder = createSinglebyteEncoder(encoding)
20+
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
1921
for (let i = 0; i < 128; i++) {
2022
let str
2123
try {
@@ -27,7 +29,9 @@ describe('single-byte encodings are supersets of ascii', () => {
2729
t.assert.strictEqual(str.length, 1, i)
2830
t.assert.strictEqual(str.codePointAt(0), i, i)
2931

32+
t.assert.strictEqual(decoderLoose(Uint8Array.of(i)), str, i)
3033
t.assert.deepStrictEqual(encoder(str), Uint8Array.of(i))
34+
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(i))
3135
}
3236
})
3337
}
@@ -84,6 +88,7 @@ describe('single-byte encodings index: Unicode', () => {
8488
const decoder = createSinglebyteDecoder(encoding)
8589
const decoderLoose = createSinglebyteDecoder(encoding, true)
8690
const encoder = createSinglebyteEncoder(encoding)
91+
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
8792
const text = readFileSync(
8893
join(import.meta.dirname, 'encoding/fixtures/unicode/', fileName),
8994
'utf8'
@@ -145,6 +150,7 @@ describe('single-byte encodings index: Unicode', () => {
145150
t.assert.strictEqual(str, decoderLoose(Uint8Array.of(byte)))
146151

147152
t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte))
153+
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte))
148154
}
149155
}
150156
})
@@ -158,6 +164,7 @@ describe('single-byte encodings index: WHATWG', () => {
158164
const decoder = createSinglebyteDecoder(encoding)
159165
const decoderLoose = createSinglebyteDecoder(encoding, true)
160166
const encoder = createSinglebyteEncoder(encoding)
167+
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
161168
const text = readFileSync(
162169
join(import.meta.dirname, 'encoding/fixtures/single-byte', `index-${encoding}.txt`),
163170
'utf8'
@@ -199,6 +206,7 @@ describe('single-byte encodings index: WHATWG', () => {
199206
t.assert.strictEqual(str, decoderLoose(Uint8Array.of(byte)))
200207

201208
t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte))
209+
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte))
202210
} else {
203211
t.assert.throws(() => decoder(Uint8Array.of(byte)))
204212
try {
@@ -230,6 +238,7 @@ describe('single-byte encodings index: WHATWG non-normative indexes.json', () =>
230238
const decoder = createSinglebyteDecoder(encoding)
231239
const decoderLoose = createSinglebyteDecoder(encoding, true)
232240
const encoder = createSinglebyteEncoder(encoding)
241+
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
233242

234243
t.assert.strictEqual(data.length, 128)
235244
for (let i = 0; i < data.length; i++) {
@@ -244,6 +253,7 @@ describe('single-byte encodings index: WHATWG non-normative indexes.json', () =>
244253
t.assert.strictEqual(decoder(Uint8Array.of(byte)), str)
245254
t.assert.strictEqual(decoderLoose(Uint8Array.of(byte)), str)
246255
t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte))
256+
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte))
247257
} else {
248258
t.assert.throws(() => decoder(Uint8Array.of(byte)))
249259
t.assert.strictEqual(decoderLoose(Uint8Array.of(byte)), '\uFFFD')
@@ -268,13 +278,16 @@ describe('x-user-defined', () => {
268278

269279
test('encode', (t) => {
270280
const encoder = createSinglebyteEncoder(encoding)
281+
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
271282
for (let byte = 0; byte < 256; byte++) {
272283
const str = String.fromCodePoint(byte >= 0x80 ? 0xf7_80 + byte - 0x80 : byte)
273284
t.assert.deepStrictEqual(encoder(str), Uint8Array.of(byte), byte)
285+
t.assert.deepStrictEqual(encoderLoose(str), Uint8Array.of(byte), byte)
274286
}
275287

276288
for (let i = 128; i < 512; i++) {
277289
t.assert.throws(() => encoder(String.fromCodePoint(i)), /Input is not well-formed/)
290+
t.assert.deepStrictEqual(encoderLoose(String.fromCodePoint(i)), Uint8Array.of(0x3f), i)
278291
}
279292
})
280293
})
@@ -284,21 +297,31 @@ describe('codes above 0x7F are non-ASCII', () => {
284297
for (const encoding of ['iso-8859-2', 'iso-8859-16']) {
285298
test(encoding, (t) => {
286299
const encoder = createSinglebyteEncoder(encoding)
300+
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
287301
t.assert.deepStrictEqual(encoder('\x80'), new Uint8Array(1).fill(0x80))
288302
t.assert.deepStrictEqual(encoder('\x80'.repeat(4)), new Uint8Array(4).fill(0x80))
289303
t.assert.deepStrictEqual(encoder('\x80'.repeat(8)), new Uint8Array(8).fill(0x80))
290304
t.assert.deepStrictEqual(encoder('\x80'.repeat(16)), new Uint8Array(16).fill(0x80))
305+
t.assert.deepStrictEqual(encoderLoose('\x80'), new Uint8Array(1).fill(0x80))
306+
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(4)), new Uint8Array(4).fill(0x80))
307+
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(8)), new Uint8Array(8).fill(0x80))
308+
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(16)), new Uint8Array(16).fill(0x80))
291309
})
292310
}
293311

294312
// 0x80 maps to something else
295313
for (const encoding of ['windows-1250', 'windows-1252', 'x-user-defined']) {
296314
test(encoding, (t) => {
297315
const encoder = createSinglebyteEncoder(encoding)
316+
const encoderLoose = createSinglebyteEncoder(encoding, { mode: 'replacement' })
298317
t.assert.throws(() => encoder('\x80'))
299318
t.assert.throws(() => encoder('\x80'.repeat(4)))
300319
t.assert.throws(() => encoder('\x80'.repeat(8)))
301320
t.assert.throws(() => encoder('\x80'.repeat(16)))
321+
t.assert.deepStrictEqual(encoderLoose('\x80'), new Uint8Array(1).fill(0x3f))
322+
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(4)), new Uint8Array(4).fill(0x3f))
323+
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(8)), new Uint8Array(8).fill(0x3f))
324+
t.assert.deepStrictEqual(encoderLoose('\x80'.repeat(16)), new Uint8Array(16).fill(0x3f))
302325
})
303326
}
304327
})

0 commit comments

Comments
 (0)