perf: UTF-8 fast path, pre-allocated output, flattened binary detection

j-mendez · claude · j-mendez · commit fe9b84062078 · 2026-03-29T08:56:08.000-04:00
- Add UTF-8 fast path in decode_to_string: skip decoder loop entirely
  when encoding is UTF-8 and input validates (common case for modern web)
- Pre-allocate output String with html.len() capacity to avoid reallocs
- Increase decode buffer from 2048 to 8192 for large (>15KB) documents,
  reducing decode loop iterations by 4x
- Flatten is_binary_file: replace double PHF lookup (first byte -> string
  key -> magic bytes) with sorted static table + binary search on first
  byte, eliminating string hashing entirely
- Keep PHF-based is_binary_file_phf for backwards compat, add parity test
- Add UTF-8 fast path tests (small + 20KB large)

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "auto_encoder"
-version = "0.2.0"
+version = "0.2.1"
 edition = "2021"
 description = "Auto encoding library"
 repository = "https://github.com/spider-rs/auto-encoder"
diff --git a/src/detect.rs b/src/detect.rs
@@ -1,11 +1,54 @@
 use crate::meta::{HtmlMetadata, ASSET_NUMBERS, FIRST_BYTE_MAP};
 
+/// Magic byte signatures grouped by first byte for single-pass matching.
+/// Sorted longest-first within each group so longer signatures match before shorter prefixes.
+static MAGIC_TABLE: &[(u8, &[&[u8]])] = &[
+    (0x00, &[&[0x00, 0x00, 0x01, 0xBA], &[0x00, 0x00, 0x01, 0xB3], &[0x00, 0x00, 0x00, 0x18]]),
+    (0x1A, &[&[0x1A, 0x45, 0xDF, 0xA3]]),
+    (0x1F, &[&[0x1F, 0x8B]]),
+    (0x25, &[b"%PDF"]),
+    (0x42, &[&[0x42, 0x5A, 0x68], &[0x42, 0x4D]]),
+    (0x46, &[&[0x46, 0x4C, 0x56, 0x01]]),
+    (0x47, &[&[0x47, 0x49, 0x46, 0x38]]),
+    (0x49, &[&[0x49, 0x49, 0x2A, 0x00], &[0x49, 0x49, 0x2B, 0x00], &[0x49, 0x44, 0x33]]),
+    (0x4C, &[&[0x4C]]),
+    (0x4D, &[&[0x4D, 0x4D, 0x00, 0x2A], &[0x4D, 0x4D, 0x00, 0x2B]]),
+    (0x4F, &[&[0x4F, 0x67, 0x67, 0x53]]),
+    (0x50, &[&[0x50, 0x4B, 0x03, 0x04]]),
+    (0x52, &[&[0x52, 0x49, 0x46, 0x46]]),
+    (0x66, &[&[0x66, 0x4C, 0x61, 0x43]]),
+    (0x7F, &[&[0x7F, 0x45, 0x4C, 0x46]]),
+    (0x89, &[&[0x89, 0x50, 0x4E, 0x47]]),
+    (0xCA, &[&[0xCA, 0xFE, 0xBA, 0xBE]]),
+    (0xFF, &[&[0xFF, 0xD8, 0xFF], &[0xFF, 0xFB]]),
+];
+
 /// Checks if the file is a known binary format using its initial bytes.
 #[inline]
 pub fn is_binary_file(content: &[u8]) -> bool {
     if content.is_empty() {
         return false;
     }
+    let first = content[0];
+    // Binary search on sorted first-byte table
+    if let Ok(idx) = MAGIC_TABLE.binary_search_by_key(&first, |&(b, _)| b) {
+        let (_, signatures) = MAGIC_TABLE[idx];
+        for sig in signatures.iter() {
+            if content.len() >= sig.len() && &content[..sig.len()] == *sig {
+                return true;
+            }
+        }
+    }
+    false
+}
+
+/// Checks if the file is a known binary format using its initial bytes.
+/// Uses the original PHF map implementation for backwards compatibility.
+#[inline]
+pub fn is_binary_file_phf(content: &[u8]) -> bool {
+    if content.is_empty() {
+        return false;
+    }
 
     if let Some(&keys) = FIRST_BYTE_MAP.get(&content[0]) {
         for &key in keys {
@@ -75,7 +118,10 @@ pub fn detect_encoding(html_content: &[u8]) -> Option<String> {
 
     while pos < search_area.len() {
         let remaining = &search_area[pos..];
-        let meta_start = find_subsequence(remaining, b"<meta")?;
+        let meta_start = match find_subsequence(remaining, b"<meta") {
+            Some(s) => s,
+            None => break,
+        };
         let meta_content = &remaining[meta_start..];
         pos += meta_start + 5;
 
diff --git a/src/lib.rs b/src/lib.rs
@@ -78,12 +78,21 @@ pub fn encoding_for_locale(locale: &str) -> Option<&'static encoding_rs::Encodin
 }
 
 fn decode_to_string(html: &[u8], encoding: &'static encoding_rs::Encoding) -> String {
+    // Fast path: UTF-8 is the common case for modern web pages.
+    // Skip the decoder loop entirely when input is valid UTF-8.
+    if encoding == encoding_rs::UTF_8 {
+        if let Ok(s) = std::str::from_utf8(html) {
+            return s.to_owned();
+        }
+        // Invalid UTF-8: fall through to decoder which handles replacement
+    }
+
     let mut decoder = encoding.new_decoder();
     let mut total_read = 0usize;
+    let mut output = String::with_capacity(html.len());
 
     let mut process = |buffer: &mut str| {
         let mut bytes_in_buffer = 0usize;
-        let mut output = String::new();
 
         loop {
             let (result, read, written, _) = decoder.decode_to_str(
@@ -113,24 +122,24 @@ fn decode_to_string(html: &[u8], encoding: &'static encoding_rs::Encoding) -> St
                 CoderResult::OutputFull => continue,
             }
         }
-
-        output
     };
 
     match html.len() {
         15001..=usize::MAX => {
-            let mut buf = [0u8; 2048];
+            let mut buf = [0u8; 8192];
             process(std::str::from_utf8_mut(&mut buf[..]).unwrap_or_default())
         }
         1000..=15000 => {
-            let mut buf = [0u8; 1024];
+            let mut buf = [0u8; 2048];
             process(std::str::from_utf8_mut(&mut buf[..]).unwrap_or_default())
         }
         _ => {
             let mut buf = [0u8; 512];
             process(std::str::from_utf8_mut(&mut buf[..]).unwrap_or_default())
         }
     }
+
+    output
 }
 
 /// Get the content with proper encoding. Pass in a proper encoding label like SHIFT_JIS.
@@ -381,6 +390,51 @@ mod tests {
         assert!(!is_binary_file(&[]));
     }
 
+    #[test]
+    fn test_utf8_fast_path() {
+        let html = "Hello, world! Some UTF-8 content: \u{00e9}\u{00e8}\u{00ea}";
+        let result = encode_bytes(html.as_bytes(), "utf-8");
+        assert_eq!(result, html);
+    }
+
+    #[test]
+    fn test_utf8_fast_path_large() {
+        let html = "x".repeat(20000);
+        let result = encode_bytes(html.as_bytes(), "utf-8");
+        assert_eq!(result, html);
+    }
+
+    #[test]
+    fn test_binary_detection_parity() {
+        // Verify flattened MAGIC_TABLE matches PHF-based detection
+        let test_cases: &[&[u8]] = &[
+            &[0xFF, 0xD8, 0xFF],       // JPEG
+            &[0x89, 0x50, 0x4E, 0x47], // PNG
+            &[0x47, 0x49, 0x46, 0x38], // GIF
+            &[0x42, 0x5A, 0x68],       // BZip
+            &[0x42, 0x4D, 0x00],       // BMP
+            &[0x50, 0x4B, 0x03, 0x04], // ZIP
+            &[0x1F, 0x8B],             // GZIP
+            &[0x7F, 0x45, 0x4C, 0x46], // ELF
+            &[0xCA, 0xFE, 0xBA, 0xBE], // Java class
+            &[0x25, 0x50, 0x44, 0x46], // PDF
+            &[0x00, 0x00, 0x01, 0xBA], // MPEG
+            &[0xFF, 0xFB],             // MP3
+            &[0x49, 0x44, 0x33],       // MP3 ID3
+            &[0x00, 0x00, 0x00, 0x00], // No match
+            &[0x01, 0x02, 0x03],       // No match
+            &[],                        // Empty
+        ];
+        for case in test_cases {
+            assert_eq!(
+                is_binary_file(case),
+                detect::is_binary_file_phf(case),
+                "Mismatch for {:?}",
+                case
+            );
+        }
+    }
+
     #[ignore]
     #[test]
     fn test_detect_encoding() {