Skip to content

Commit fe9b840

Browse files
j-mendezclaude
andcommitted
perf: UTF-8 fast path, pre-allocated output, flattened binary detection
- Add UTF-8 fast path in decode_to_string: skip decoder loop entirely when encoding is UTF-8 and input validates (common case for modern web) - Pre-allocate output String with html.len() capacity to avoid reallocs - Increase decode buffer from 2048 to 8192 for large (>15KB) documents, reducing decode loop iterations by 4x - Flatten is_binary_file: replace double PHF lookup (first byte -> string key -> magic bytes) with sorted static table + binary search on first byte, eliminating string hashing entirely - Keep PHF-based is_binary_file_phf for backwards compat, add parity test - Add UTF-8 fast path tests (small + 20KB large) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 2151f53 commit fe9b840

3 files changed

Lines changed: 107 additions & 7 deletions

File tree

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "auto_encoder"
3-
version = "0.2.0"
3+
version = "0.2.1"
44
edition = "2021"
55
description = "Auto encoding library"
66
repository = "https://github.com/spider-rs/auto-encoder"

src/detect.rs

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,54 @@
11
use crate::meta::{HtmlMetadata, ASSET_NUMBERS, FIRST_BYTE_MAP};
22

3+
/// Magic byte signatures grouped by first byte for single-pass matching.
4+
/// Sorted longest-first within each group so longer signatures match before shorter prefixes.
5+
static MAGIC_TABLE: &[(u8, &[&[u8]])] = &[
6+
(0x00, &[&[0x00, 0x00, 0x01, 0xBA], &[0x00, 0x00, 0x01, 0xB3], &[0x00, 0x00, 0x00, 0x18]]),
7+
(0x1A, &[&[0x1A, 0x45, 0xDF, 0xA3]]),
8+
(0x1F, &[&[0x1F, 0x8B]]),
9+
(0x25, &[b"%PDF"]),
10+
(0x42, &[&[0x42, 0x5A, 0x68], &[0x42, 0x4D]]),
11+
(0x46, &[&[0x46, 0x4C, 0x56, 0x01]]),
12+
(0x47, &[&[0x47, 0x49, 0x46, 0x38]]),
13+
(0x49, &[&[0x49, 0x49, 0x2A, 0x00], &[0x49, 0x49, 0x2B, 0x00], &[0x49, 0x44, 0x33]]),
14+
(0x4C, &[&[0x4C]]),
15+
(0x4D, &[&[0x4D, 0x4D, 0x00, 0x2A], &[0x4D, 0x4D, 0x00, 0x2B]]),
16+
(0x4F, &[&[0x4F, 0x67, 0x67, 0x53]]),
17+
(0x50, &[&[0x50, 0x4B, 0x03, 0x04]]),
18+
(0x52, &[&[0x52, 0x49, 0x46, 0x46]]),
19+
(0x66, &[&[0x66, 0x4C, 0x61, 0x43]]),
20+
(0x7F, &[&[0x7F, 0x45, 0x4C, 0x46]]),
21+
(0x89, &[&[0x89, 0x50, 0x4E, 0x47]]),
22+
(0xCA, &[&[0xCA, 0xFE, 0xBA, 0xBE]]),
23+
(0xFF, &[&[0xFF, 0xD8, 0xFF], &[0xFF, 0xFB]]),
24+
];
25+
326
/// Checks if the file is a known binary format using its initial bytes.
427
#[inline]
528
pub fn is_binary_file(content: &[u8]) -> bool {
629
if content.is_empty() {
730
return false;
831
}
32+
let first = content[0];
33+
// Binary search on sorted first-byte table
34+
if let Ok(idx) = MAGIC_TABLE.binary_search_by_key(&first, |&(b, _)| b) {
35+
let (_, signatures) = MAGIC_TABLE[idx];
36+
for sig in signatures.iter() {
37+
if content.len() >= sig.len() && &content[..sig.len()] == *sig {
38+
return true;
39+
}
40+
}
41+
}
42+
false
43+
}
44+
45+
/// Checks if the file is a known binary format using its initial bytes.
46+
/// Uses the original PHF map implementation for backwards compatibility.
47+
#[inline]
48+
pub fn is_binary_file_phf(content: &[u8]) -> bool {
49+
if content.is_empty() {
50+
return false;
51+
}
952

1053
if let Some(&keys) = FIRST_BYTE_MAP.get(&content[0]) {
1154
for &key in keys {
@@ -75,7 +118,10 @@ pub fn detect_encoding(html_content: &[u8]) -> Option<String> {
75118

76119
while pos < search_area.len() {
77120
let remaining = &search_area[pos..];
78-
let meta_start = find_subsequence(remaining, b"<meta")?;
121+
let meta_start = match find_subsequence(remaining, b"<meta") {
122+
Some(s) => s,
123+
None => break,
124+
};
79125
let meta_content = &remaining[meta_start..];
80126
pos += meta_start + 5;
81127

src/lib.rs

Lines changed: 59 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,21 @@ pub fn encoding_for_locale(locale: &str) -> Option<&'static encoding_rs::Encodin
7878
}
7979

8080
fn decode_to_string(html: &[u8], encoding: &'static encoding_rs::Encoding) -> String {
81+
// Fast path: UTF-8 is the common case for modern web pages.
82+
// Skip the decoder loop entirely when input is valid UTF-8.
83+
if encoding == encoding_rs::UTF_8 {
84+
if let Ok(s) = std::str::from_utf8(html) {
85+
return s.to_owned();
86+
}
87+
// Invalid UTF-8: fall through to decoder which handles replacement
88+
}
89+
8190
let mut decoder = encoding.new_decoder();
8291
let mut total_read = 0usize;
92+
let mut output = String::with_capacity(html.len());
8393

8494
let mut process = |buffer: &mut str| {
8595
let mut bytes_in_buffer = 0usize;
86-
let mut output = String::new();
8796

8897
loop {
8998
let (result, read, written, _) = decoder.decode_to_str(
@@ -113,24 +122,24 @@ fn decode_to_string(html: &[u8], encoding: &'static encoding_rs::Encoding) -> St
113122
CoderResult::OutputFull => continue,
114123
}
115124
}
116-
117-
output
118125
};
119126

120127
match html.len() {
121128
15001..=usize::MAX => {
122-
let mut buf = [0u8; 2048];
129+
let mut buf = [0u8; 8192];
123130
process(std::str::from_utf8_mut(&mut buf[..]).unwrap_or_default())
124131
}
125132
1000..=15000 => {
126-
let mut buf = [0u8; 1024];
133+
let mut buf = [0u8; 2048];
127134
process(std::str::from_utf8_mut(&mut buf[..]).unwrap_or_default())
128135
}
129136
_ => {
130137
let mut buf = [0u8; 512];
131138
process(std::str::from_utf8_mut(&mut buf[..]).unwrap_or_default())
132139
}
133140
}
141+
142+
output
134143
}
135144

136145
/// Get the content with proper encoding. Pass in a proper encoding label like SHIFT_JIS.
@@ -381,6 +390,51 @@ mod tests {
381390
assert!(!is_binary_file(&[]));
382391
}
383392

393+
#[test]
394+
fn test_utf8_fast_path() {
395+
let html = "Hello, world! Some UTF-8 content: \u{00e9}\u{00e8}\u{00ea}";
396+
let result = encode_bytes(html.as_bytes(), "utf-8");
397+
assert_eq!(result, html);
398+
}
399+
400+
#[test]
401+
fn test_utf8_fast_path_large() {
402+
let html = "x".repeat(20000);
403+
let result = encode_bytes(html.as_bytes(), "utf-8");
404+
assert_eq!(result, html);
405+
}
406+
407+
#[test]
408+
fn test_binary_detection_parity() {
409+
// Verify flattened MAGIC_TABLE matches PHF-based detection
410+
let test_cases: &[&[u8]] = &[
411+
&[0xFF, 0xD8, 0xFF], // JPEG
412+
&[0x89, 0x50, 0x4E, 0x47], // PNG
413+
&[0x47, 0x49, 0x46, 0x38], // GIF
414+
&[0x42, 0x5A, 0x68], // BZip
415+
&[0x42, 0x4D, 0x00], // BMP
416+
&[0x50, 0x4B, 0x03, 0x04], // ZIP
417+
&[0x1F, 0x8B], // GZIP
418+
&[0x7F, 0x45, 0x4C, 0x46], // ELF
419+
&[0xCA, 0xFE, 0xBA, 0xBE], // Java class
420+
&[0x25, 0x50, 0x44, 0x46], // PDF
421+
&[0x00, 0x00, 0x01, 0xBA], // MPEG
422+
&[0xFF, 0xFB], // MP3
423+
&[0x49, 0x44, 0x33], // MP3 ID3
424+
&[0x00, 0x00, 0x00, 0x00], // No match
425+
&[0x01, 0x02, 0x03], // No match
426+
&[], // Empty
427+
];
428+
for case in test_cases {
429+
assert_eq!(
430+
is_binary_file(case),
431+
detect::is_binary_file_phf(case),
432+
"Mismatch for {:?}",
433+
case
434+
);
435+
}
436+
}
437+
384438
#[ignore]
385439
#[test]
386440
fn test_detect_encoding() {

0 commit comments

Comments
 (0)