Skip to content

Commit fdf4cab

Browse files
j-mendez and claude committed
fix: remove UTF-8 fast path that could bypass encoding conversion
The UTF-8 fast path (from_utf8 + to_owned) skipped the encoding_rs decoder when input validated as UTF-8. This is unsafe for the crate's purpose: bytes detected as a non-UTF-8 encoding might partially validate as UTF-8 but need proper re-encoding. The decoder also handles replacement characters for invalid sequences, which the fast path skipped. Keeps: pre-allocated output, larger decode buffers, flattened binary detection, SIMD search via memchr. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent df28fab commit fdf4cab

2 files changed

Lines changed: 1 addition & 24 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "auto_encoder"
3-
version = "0.2.1"
3+
version = "0.2.2"
44
edition = "2021"
55
description = "Auto encoding library"
66
repository = "https://github.com/spider-rs/auto-encoder"

src/lib.rs

Lines changed: 0 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -78,15 +78,6 @@ pub fn encoding_for_locale(locale: &str) -> Option<&'static encoding_rs::Encodin
7878
}
7979

8080
fn decode_to_string(html: &[u8], encoding: &'static encoding_rs::Encoding) -> String {
81-
// Fast path: UTF-8 is the common case for modern web pages.
82-
// Skip the decoder loop entirely when input is valid UTF-8.
83-
if encoding == encoding_rs::UTF_8 {
84-
if let Ok(s) = std::str::from_utf8(html) {
85-
return s.to_owned();
86-
}
87-
// Invalid UTF-8: fall through to decoder which handles replacement
88-
}
89-
9081
let mut decoder = encoding.new_decoder();
9182
let mut total_read = 0usize;
9283
let mut output = String::with_capacity(html.len());
@@ -390,20 +381,6 @@ mod tests {
390381
assert!(!is_binary_file(&[]));
391382
}
392383

393-
#[test]
394-
fn test_utf8_fast_path() {
395-
let html = "Hello, world! Some UTF-8 content: \u{00e9}\u{00e8}\u{00ea}";
396-
let result = encode_bytes(html.as_bytes(), "utf-8");
397-
assert_eq!(result, html);
398-
}
399-
400-
#[test]
401-
fn test_utf8_fast_path_large() {
402-
let html = "x".repeat(20000);
403-
let result = encode_bytes(html.as_bytes(), "utf-8");
404-
assert_eq!(result, html);
405-
}
406-
407384
#[test]
408385
fn test_binary_detection_parity() {
409386
// Verify flattened MAGIC_TABLE matches PHF-based detection

0 commit comments

Comments
 (0)