@@ -45,9 +45,14 @@ impl BloomKey {
4545 }
4646
4747 fn from_bytes ( path : & [ u8 ] , settings : BloomFilterSettings ) -> Self {
48+ let ( h0, h1) = match settings. hash_version {
49+ 1 => ( murmur3_v1 ( SEED0 , path) , murmur3_v1 ( SEED1 , path) ) ,
50+ 2 => ( murmur3_v2 ( SEED0 , path) , murmur3_v2 ( SEED1 , path) ) ,
51+ version => panic ! ( "BUG: unsupported Bloom hash version {version} should have been filtered earlier" ) ,
52+ } ;
4853 Self {
49- h0 : murmur3_v2 ( SEED0 , path ) ,
50- h1 : murmur3_v2 ( SEED1 , path ) ,
54+ h0,
55+ h1,
5156 num_hashes : settings. num_hashes ,
5257 }
5358 }
@@ -144,16 +149,91 @@ impl Graph {
144149 }
145150}
146151
/// Compute a seeded 32-bit murmur3 hash of `data`, widening every input byte as a *signed*
/// 8-bit value.
///
/// Because of that sign extension, bytes with the high bit set (`0x80..=0xff`) smear ones into
/// the upper bits of the block they belong to, so the result can differ from [`murmur3_v2`]
/// for such input. For input consisting only of bytes below `0x80` no sign bits are set and
/// the extension has no effect.
///
/// * `seed` - initial hash state.
/// * `data` - the bytes to hash; may be empty.
pub(crate) fn murmur3_v1(seed: u32, data: &[u8]) -> u32 {
    const C1: u32 = 0xcc9e_2d51;
    const C2: u32 = 0x1b87_3593;
    const R1: u32 = 15;
    const R2: u32 = 13;
    const M: u32 = 5;
    const N: u32 = 0xe654_6b64;

    /// Widen `byte` to 32 bits as if it were an `i8`, i.e. with sign extension.
    fn sign_extend(byte: u8) -> u32 {
        u32::from_ne_bytes(i32::from(i8::from_ne_bytes([byte])).to_ne_bytes())
    }

    /// Scramble one 32-bit block before it is folded into the running hash.
    fn scramble(block: u32) -> u32 {
        block.wrapping_mul(C1).rotate_left(R1).wrapping_mul(C2)
    }

    let mut hash = seed;
    let chunks = data.chunks_exact(4);
    let tail = chunks.remainder();

    // Body: full 4-byte blocks, assembled least-significant byte first. The bytes are combined
    // with `|`, so a sign-extended low byte can mask the bytes above it.
    for chunk in chunks {
        let block = sign_extend(chunk[0])
            | (sign_extend(chunk[1]) << 8)
            | (sign_extend(chunk[2]) << 16)
            | (sign_extend(chunk[3]) << 24);
        hash ^= scramble(block);
        hash = hash.rotate_left(R2).wrapping_mul(M).wrapping_add(N);
    }

    // Tail: the 1 to 3 leftover bytes are combined with `^` (not `|`) and mixed in without the
    // rotate/multiply/add step applied to full blocks.
    if !tail.is_empty() {
        let block = tail
            .iter()
            .enumerate()
            .fold(0u32, |acc, (idx, &byte)| acc ^ (sign_extend(byte) << (8 * idx)));
        hash ^= scramble(block);
    }

    // Finalization. The cast truncates: only the low 32 bits of the length are mixed in.
    hash ^= data.len() as u32;
    hash ^= hash >> 16;
    hash = hash.wrapping_mul(0x85eb_ca6b);
    hash ^= hash >> 13;
    hash = hash.wrapping_mul(0xc2b2_ae35);
    hash ^= hash >> 16;
    hash
}
213+
147214pub ( crate ) fn murmur3_v2 ( seed : u32 , data : & [ u8 ] ) -> u32 {
148215 let mut reader = Cursor :: new ( data) ;
149216 murmur3:: murmur3_32 ( & mut reader, seed) . expect ( "reading from memory does not fail" )
150217}
151218#[ cfg( test) ]
152219mod tests {
153- use super :: { murmur3_v2, BloomKey } ;
220+ use super :: { murmur3_v2, BloomKey , BITS_PER_WORD } ;
154221 use crate :: BloomFilterSettings ;
155222 use bstr:: BStr ;
156223
224+ fn filter_bytes_for_path ( path : & [ u8 ] , settings : BloomFilterSettings , len : usize ) -> Vec < u8 > {
225+ let key = BloomKey :: from_path ( BStr :: new ( path) , settings) ;
226+ let mut out = vec ! [ 0u8 ; len] ;
227+ let modulo = ( len as u64 ) * BITS_PER_WORD ;
228+ for i in 0 ..key. num_hashes {
229+ let hash = key. h0 . wrapping_add ( i. wrapping_mul ( key. h1 ) ) ;
230+ let bit_pos = u64:: from ( hash) % modulo;
231+ let byte_pos = ( bit_pos / BITS_PER_WORD ) as usize ;
232+ out[ byte_pos] |= 1u8 << ( bit_pos % BITS_PER_WORD ) ;
233+ }
234+ out
235+ }
236+
157237 #[ test]
158238 fn murmur3_known_vectors_match_git_and_reference_values ( ) {
159239 assert_eq ! ( murmur3_v2( 0 , b"" ) , 0x0000_0000 ) ;
@@ -165,7 +245,36 @@ mod tests {
165245 }
166246
/// `murmur3_v2` must reproduce the expected hash for a seven-byte input in which every byte
/// has its high bit set.
#[test]
fn murmur3_v2_matches_git_high_bit_vector() {
    // The input is exactly seven bytes; no separators between the escapes.
    assert_eq!(murmur3_v2(0, b"\x99\xaa\xbb\xcc\xdd\xee\xff"), 0xa183_ccfd);
}
251+
/// With hash version 1, the seven probe hashes derived for the empty path must match the
/// expected vector.
#[test]
fn bloom_key_for_empty_path_matches_git_v1_vector() {
    let settings = BloomFilterSettings {
        hash_version: 1,
        num_hashes: 7,
        bits_per_entry: 10,
    };
    let key = BloomKey::from_path(BStr::new(b""), settings);
    // The i-th probe is `h0 + i * h1`, computed with wrapping arithmetic.
    assert_eq!(
        (0..key.num_hashes)
            .map(|i| key.h0.wrapping_add(i.wrapping_mul(key.h1)))
            .collect::<Vec<_>>(),
        &[
            0x5615_800c,
            0x5b96_6560,
            0x6117_4ab4,
            0x6698_3008,
            0x6c19_155c,
            0x7199_fab0,
            0x771a_e004
        ]
    );
}
275+
276+ #[ test]
277+ fn bloom_key_for_empty_path_matches_git_v2_vector ( ) {
169278 let settings = BloomFilterSettings {
170279 hash_version : 2 ,
171280 num_hashes : 7 ,
@@ -187,4 +296,55 @@ mod tests {
187296 ]
188297 ) ;
189298 }
299+
/// A path made of bytes with the high bit set must produce different keys under hash
/// versions 1 and 2.
#[test]
fn bloom_key_for_high_bit_path_differs_between_versions() {
    // Two bytes, both >= 0x80; no separators between the escapes.
    let path = BStr::new(b"\xc2\xa2");
    let v1 = BloomKey::from_path(
        path,
        BloomFilterSettings {
            hash_version: 1,
            num_hashes: 7,
            bits_per_entry: 10,
        },
    );
    let v2 = BloomKey::from_path(
        path,
        BloomFilterSettings {
            hash_version: 2,
            num_hashes: 7,
            bits_per_entry: 10,
        },
    );
    assert_ne!(v1, v2);
}
321+
/// The two-byte filter built for a high-bit path must match the expected bytes for each hash
/// version.
#[test]
fn bloom_filter_for_high_bit_path_matches_git_v1_and_v2_vectors() {
    // Two bytes, both >= 0x80; no separators between the escapes.
    let path = b"\xc2\xa2";
    assert_eq!(
        filter_bytes_for_path(
            path,
            BloomFilterSettings {
                hash_version: 1,
                num_hashes: 7,
                bits_per_entry: 10,
            },
            2,
        ),
        vec![0x52, 0xa9]
    );
    assert_eq!(
        filter_bytes_for_path(
            path,
            BloomFilterSettings {
                hash_version: 2,
                num_hashes: 7,
                bits_per_entry: 10,
            },
            2,
        ),
        vec![0xc0, 0x1f]
    );
}
190350}
0 commit comments