Describe what you are looking for
Summary
sz_find_neon for needle lengths 2-4 on aarch64 is roughly 2x slower than glibc 2.28's NEON-accelerated memmem for full-haystack scans (not-found case).
Benchmark evidence
aarch64 (Graviton-class, 16 cores), 4096-byte haystack, Stringzilla v4.6.0 vs glibc 2.28 memmem.
Not-found (full haystack scan)
"Sparse" = URL-like text, needle bytes don't appear. "Dense" = all 'a', worst case for partial matches.
| Needle | glibc 2.28 memmem | Stringzilla NEON | 4x unrolled (below) |
|---|---|---|---|
| 2-byte sparse | 77 ns (~50 GB/s) | 162 ns (~24 GB/s) | 150 ns (~25 GB/s) |
| 2-byte dense | 85 ns (~45 GB/s) | 159 ns (~24 GB/s) | 150 ns (~25 GB/s) |
| 3-byte sparse | 77 ns | 227 ns | 153 ns |
| 3-byte dense | 85 ns | 227 ns | — |
| 4-byte sparse | 77 ns | 294 ns | 152 ns |
| 4-byte dense | 90 ns | 294 ns | — |
Found (early hit) — Stringzilla already dominates
| Needle | glibc 2.28 memmem | Stringzilla NEON |
|---|---|---|
| 2-byte | ~3.2 ns | 1.4 ns |
| 3-byte | ~3.4 ns | 1.5 ns |
| 8-byte | 22 ns | 7.1 ns |
| 16-byte | 40 ns | 6.0 ns |
| 32-byte | 168 ns | 6.4 ns |
| 64-byte | 236 ns | 7.6 ns |
Analysis
The current sz_find_neon 2-byte path processes 16 bytes per iteration with 2 loads (aligned + offset-by-1):
for (; h_length >= 17; h += 16, h_length -= 16) {
h_first_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 0));
h_last_vec.u8x16 = vld1q_u8((sz_u8_t const *)(h + 1));
matches_vec.u8x16 = vandq_u8(
vceqq_u8(h_first_vec.u8x16, n_first_vec.u8x16),
vceqq_u8(h_last_vec.u8x16, n_last_vec.u8x16));
matches = sz_find_vreinterpretq_u8_u4_(matches_vec.u8x16);
if (matches) return h + sz_u64_ctz(matches) / 4;
}
A 4x unrolled version with deferred match checking improved 3-4 byte needles significantly (294→152 ns) but the 2-byte case only went from 162→150 ns. The persistent ~2x gap vs glibc 2.28 suggests micro-architectural advantages in glibc's implementation — it would be worth studying the glibc 2.28 aarch64 memmem implementation and its NEON-specific routines for instruction scheduling insights.
Prototype: 4x unrolled NEON with deferred match extraction
This version processes 64 bytes per iteration with 8 overlapping loads and a single combined branch for the not-found fast path:
// 4x unrolled NEON loop with deferred match extraction.
//
// Scans 64 bytes per iteration with 8 overlapping 16-byte loads: for each
// 16-byte chunk, one load at offset 0 and one at offset +1 are compared
// against the needle's first and second bytes. A single vorrq_u8 + vmaxvq_u8
// reduction gives one branch for the common not-found case; the expensive
// mask extraction only runs when a candidate exists.

// Walk every candidate encoded in a 16-byte match mask (4 bits per byte
// position, as produced by sz_find_vreinterpretq_u8_u4_) and return the first
// candidate that passes full-needle verification, or SZ_NULL_CHAR.
//
// Two correctness points this helper enforces:
//  - ALL candidates in the mask are tried: a failed verification at one
//    position must not hide a later genuine match in the same vector.
//  - A candidate is only verified when the whole needle fits inside the
//    remaining haystack (`base + off + n_length <= h_remaining`); otherwise
//    sz_equal would read past the buffer and could report a bogus match.
static sz_cptr_t sz_find_neon_verify_candidates_( //
    sz_u64_t bits, sz_cptr_t h, sz_size_t base, sz_size_t h_remaining, sz_cptr_t n, sz_size_t n_length) {
    while (bits) {
        sz_size_t bit = (sz_size_t)sz_u64_ctz(bits);
        sz_size_t off = base + bit / 4;
        if (off + n_length <= h_remaining && (n_length <= 2 || sz_equal(h + off, n, n_length))) return h + off;
        // Clear the whole 4-bit nibble of this candidate and keep scanning.
        bits &= ~((sz_u64_t)0xF << (bit & ~(sz_size_t)3));
    }
    return SZ_NULL_CHAR;
}

sz_cptr_t sz_find_neon_unrolled(sz_cptr_t h, sz_size_t h_length, //
                                sz_cptr_t n, sz_size_t n_length) {
    // Degenerate needles: empty or longer than the haystack can't match;
    // single-byte needles take the dedicated byte-search path.
    if (h_length < n_length || n_length == 0) return SZ_NULL_CHAR;
    if (n_length == 1) return sz_find_byte_neon(h, h_length, n);
    uint8x16_t n0 = vdupq_n_u8((sz_u8_t)n[0]);
    uint8x16_t n1 = vdupq_n_u8((sz_u8_t)n[1]);
    // 4x unrolled: 64 bytes per iteration, 8 overlapping loads. The >= 65
    // guard guarantees the offset-by-1 loads stay in bounds (last load reads
    // bytes [49, 65)).
    while (h_length >= 65) {
        sz_u8_t const *p = (sz_u8_t const *)h;
        uint8x16_t h0a = vld1q_u8(p);
        uint8x16_t h1a = vld1q_u8(p + 1);
        uint8x16_t ma = vandq_u8(vceqq_u8(h0a, n0), vceqq_u8(h1a, n1));
        uint8x16_t h0b = vld1q_u8(p + 16);
        uint8x16_t h1b = vld1q_u8(p + 17);
        uint8x16_t mb = vandq_u8(vceqq_u8(h0b, n0), vceqq_u8(h1b, n1));
        uint8x16_t h0c = vld1q_u8(p + 32);
        uint8x16_t h1c = vld1q_u8(p + 33);
        uint8x16_t mc = vandq_u8(vceqq_u8(h0c, n0), vceqq_u8(h1c, n1));
        uint8x16_t h0d = vld1q_u8(p + 48);
        uint8x16_t h1d = vld1q_u8(p + 49);
        uint8x16_t md = vandq_u8(vceqq_u8(h0d, n0), vceqq_u8(h1d, n1));
        // Single combined not-found check across all 4 vectors.
        uint8x16_t any = vorrq_u8(vorrq_u8(ma, mb), vorrq_u8(mc, md));
        if (vmaxvq_u8(any)) {
            // Rare path: verify every candidate, in order, across all 4
            // vectors. Each vector's mask is walked to exhaustion so a false
            // positive on the first two bytes can't shadow a later real match.
            sz_cptr_t found;
            found = sz_find_neon_verify_candidates_(sz_find_vreinterpretq_u8_u4_(ma), h, 0, h_length, n, n_length);
            if (found) return found;
            found = sz_find_neon_verify_candidates_(sz_find_vreinterpretq_u8_u4_(mb), h, 16, h_length, n, n_length);
            if (found) return found;
            found = sz_find_neon_verify_candidates_(sz_find_vreinterpretq_u8_u4_(mc), h, 32, h_length, n, n_length);
            if (found) return found;
            found = sz_find_neon_verify_candidates_(sz_find_vreinterpretq_u8_u4_(md), h, 48, h_length, n, n_length);
            if (found) return found;
        }
        h += 64;
        h_length -= 64;
    }
    // Tail: single-vector loop, same exhaustive candidate verification.
    while (h_length >= 17) {
        uint8x16_t h0 = vld1q_u8((sz_u8_t const *)h);
        uint8x16_t h1 = vld1q_u8((sz_u8_t const *)(h + 1));
        uint8x16_t m = vandq_u8(vceqq_u8(h0, n0), vceqq_u8(h1, n1));
        sz_u64_t bits = sz_find_vreinterpretq_u8_u4_(m);
        if (bits) {
            sz_cptr_t found = sz_find_neon_verify_candidates_(bits, h, 0, h_length, n, n_length);
            if (found) return found;
        }
        h += 16;
        h_length -= 16;
    }
    // Remaining < 17 bytes: fall back to the scalar search.
    return sz_find_serial(h, h_length, n, n_length);
}
Key changes:
- 4x unrolled: 64 bytes per iteration instead of 16
- Deferred match extraction:
vorrq_u8 + vmaxvq_u8 checks all 4 vectors with a single branch; the expensive bitmask extraction only runs on match
- Generalizes to any needle length >= 2 (first two bytes checked via NEON, remaining verified with
sz_equal)
This closes the gap for 3-4 byte needles (~2x improvement) but the 2-byte not-found case remains ~2x slower than glibc 2.28. The remaining gap likely requires studying the glibc 2.28 aarch64 NEON memmem implementation for instruction scheduling and micro-architectural optimizations beyond what C intrinsics can express.
Context
We're evaluating Stringzilla as a portable replacement for glibc memmem in a large C++ codebase. Stringzilla's consistency across environments is a major advantage — glibc memmem performance varies wildly between versions (77 ns on 2.28 vs 2,133 ns on 2.34 for the same 2-byte sparse workload on aarch64). Even at 150 ns, Stringzilla would be a 14x improvement over the glibc 2.34 memmem that production systems actually use. Closing the remaining 2x gap vs glibc 2.28 would make it strictly better in all scenarios.
Can you contribute to the implementation?
Is your feature request specific to a certain interface?
C implementation
Contact Details
No response
Is there an existing issue for this?
Code of Conduct
Describe what you are looking for
Summary
`sz_find_neon` for needle lengths 2-4 on aarch64 is roughly 2x slower than glibc 2.28's NEON-accelerated `memmem` for full-haystack scans (not-found case).

Benchmark evidence
aarch64 (Graviton-class, 16 cores), 4096-byte haystack, Stringzilla v4.6.0 vs glibc 2.28 memmem.
Not-found (full haystack scan)
"Sparse" = URL-like text, needle bytes don't appear. "Dense" = all 'a', worst case for partial matches.

Found (early hit) — Stringzilla already dominates
Analysis
The current `sz_find_neon` 2-byte path processes 16 bytes per iteration with 2 loads (aligned + offset-by-1). A 4x unrolled version with deferred match checking improved 3-4 byte needles significantly (294→152 ns) but the 2-byte case only went from 162→150 ns. The persistent ~2x gap vs glibc 2.28 suggests micro-architectural advantages in glibc's implementation — it would be worth studying the glibc 2.28 aarch64 memmem implementation and its NEON-specific routines for instruction scheduling insights.
Prototype: 4x unrolled NEON with deferred match extraction
This version processes 64 bytes per iteration with 8 overlapping loads and a single combined branch for the not-found fast path:
Key changes:
- `vorrq_u8` + `vmaxvq_u8` checks all 4 vectors with a single branch; the expensive bitmask extraction only runs on match
- Generalizes to any needle length >= 2 (first two bytes checked via NEON, remaining verified with `sz_equal`)

This closes the gap for 3-4 byte needles (~2x improvement) but the 2-byte not-found case remains ~2x slower than glibc 2.28. The remaining gap likely requires studying the glibc 2.28 aarch64 NEON memmem implementation for instruction scheduling and micro-architectural optimizations beyond what C intrinsics can express.
Context
We're evaluating Stringzilla as a portable replacement for glibc `memmem` in a large C++ codebase. Stringzilla's consistency across environments is a major advantage — glibc memmem performance varies wildly between versions (77 ns on 2.28 vs 2,133 ns on 2.34 for the same 2-byte sparse workload on aarch64). Even at 150 ns, Stringzilla would be a 14x improvement over the glibc 2.34 memmem that production systems actually use. Closing the remaining 2x gap vs glibc 2.28 would make it strictly better in all scenarios.

Can you contribute to the implementation?
Is your feature request specific to a certain interface?
C implementation
Contact Details
No response
Is there an existing issue for this?
Code of Conduct