Skip to content

Commit 5b94604

Browse files
committed
gix-commitgraph: add support for v1 hashes too
Signed-off-by: Vicent Marti <vmg@strn.cat>
1 parent a85c1fe commit 5b94604

File tree

12 files changed

+364
-7
lines changed

12 files changed

+364
-7
lines changed

gix-commitgraph/src/bloom.rs

Lines changed: 164 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -45,9 +45,14 @@ impl BloomKey {
4545
}
4646

4747
fn from_bytes(path: &[u8], settings: BloomFilterSettings) -> Self {
48+
let (h0, h1) = match settings.hash_version {
49+
1 => (murmur3_v1(SEED0, path), murmur3_v1(SEED1, path)),
50+
2 => (murmur3_v2(SEED0, path), murmur3_v2(SEED1, path)),
51+
version => panic!("BUG: unsupported Bloom hash version {version} should have been filtered earlier"),
52+
};
4853
Self {
49-
h0: murmur3_v2(SEED0, path),
50-
h1: murmur3_v2(SEED1, path),
54+
h0,
55+
h1,
5156
num_hashes: settings.num_hashes,
5257
}
5358
}
@@ -144,16 +149,91 @@ impl Graph {
144149
}
145150
}
146151

152+
/// Compute the 32-bit Murmur3-style hash used for version 1 changed-path Bloom filters.
///
/// The structure is that of Murmur3 (x86, 32-bit): full 4-byte blocks are packed
/// least-significant byte first and mixed into the running state, a 1 to 3 byte tail is mixed
/// in without the final rotate/multiply/add step, and the state is finalised with the input
/// length and an avalanche sequence.
///
/// The one deliberate deviation is that every input byte is *sign-extended* to 32 bits before
/// being shifted into place, so a byte with its high bit set also sets every more significant
/// bit of the word it is combined into. For input made only of bytes below `0x80` the sign
/// extension is a no-op.
///
/// `seed` is the initial hash state and `data` is the byte string to hash.
pub(crate) fn murmur3_v1(seed: u32, data: &[u8]) -> u32 {
    const C1: u32 = 0xcc9e_2d51;
    const C2: u32 = 0x1b87_3593;
    const R1: u32 = 15;
    const R2: u32 = 13;
    const M: u32 = 5;
    const N: u32 = 0xe654_6b64;

    /// Reinterpret `byte` as a signed 8-bit value and sign-extend it to 32 bits.
    fn sign_extend(byte: u8) -> u32 {
        u32::from_ne_bytes(i32::from(i8::from_ne_bytes([byte])).to_ne_bytes())
    }

    /// Pre-mix a packed word before it is XOR-ed into the hash state.
    fn scramble(k: u32) -> u32 {
        k.wrapping_mul(C1).rotate_left(R1).wrapping_mul(C2)
    }

    let blocks = data.chunks_exact(4);
    let tail = blocks.remainder();

    let mut hash = seed;
    for block in blocks {
        // Byte `i` lands at bit offset `8 * i`; the sign-extended bytes are OR-ed together,
        // which is what lets a high-bit byte overwrite the bytes above it.
        let k = block
            .iter()
            .enumerate()
            .fold(0u32, |word, (i, &byte)| word | (sign_extend(byte) << (8 * i)));
        hash ^= scramble(k);
        hash = hash.rotate_left(R2).wrapping_mul(M).wrapping_add(N);
    }

    if !tail.is_empty() {
        // The tail holds at most three bytes and combines them with XOR rather than OR.
        let k1 = tail
            .iter()
            .enumerate()
            .fold(0u32, |word, (i, &byte)| word ^ (sign_extend(byte) << (8 * i)));
        hash ^= scramble(k1);
    }

    // The length is intentionally truncated to 32 bits before being mixed in.
    hash ^= data.len() as u32;

    // Final avalanche.
    hash ^= hash >> 16;
    hash = hash.wrapping_mul(0x85eb_ca6b);
    hash ^= hash >> 13;
    hash = hash.wrapping_mul(0xc2b2_ae35);
    hash ^= hash >> 16;
    hash
}
213+
147214
pub(crate) fn murmur3_v2(seed: u32, data: &[u8]) -> u32 {
148215
let mut reader = Cursor::new(data);
149216
murmur3::murmur3_32(&mut reader, seed).expect("reading from memory does not fail")
150217
}
151218
#[cfg(test)]
152219
mod tests {
153-
use super::{murmur3_v2, BloomKey};
220+
use super::{murmur3_v2, BloomKey, BITS_PER_WORD};
154221
use crate::BloomFilterSettings;
155222
use bstr::BStr;
156223

224+
fn filter_bytes_for_path(path: &[u8], settings: BloomFilterSettings, len: usize) -> Vec<u8> {
225+
let key = BloomKey::from_path(BStr::new(path), settings);
226+
let mut out = vec![0u8; len];
227+
let modulo = (len as u64) * BITS_PER_WORD;
228+
for i in 0..key.num_hashes {
229+
let hash = key.h0.wrapping_add(i.wrapping_mul(key.h1));
230+
let bit_pos = u64::from(hash) % modulo;
231+
let byte_pos = (bit_pos / BITS_PER_WORD) as usize;
232+
out[byte_pos] |= 1u8 << (bit_pos % BITS_PER_WORD);
233+
}
234+
out
235+
}
236+
157237
#[test]
158238
fn murmur3_known_vectors_match_git_and_reference_values() {
159239
assert_eq!(murmur3_v2(0, b""), 0x0000_0000);
@@ -165,7 +245,36 @@ mod tests {
165245
}
166246

167247
/// The version 2 hash of a byte string made only of high-bit bytes yields the expected value.
#[test]
fn murmur3_v2_matches_git_high_bit_vector() {
    assert_eq!(murmur3_v2(0, b"\x99\xaa\xbb\xcc\xdd\xee\xff"), 0xa183_ccfd);
}
251+
252+
/// With version 1 settings, the empty path produces the expected sequence of seven hashes.
#[test]
fn bloom_key_for_empty_path_matches_git_v1_vector() {
    let settings = BloomFilterSettings {
        hash_version: 1,
        num_hashes: 7,
        bits_per_entry: 10,
    };
    let key = BloomKey::from_path(BStr::new(b""), settings);
    // The i-th probe hash is `h0 + i * h1`, using wrapping arithmetic.
    assert_eq!(
        (0..key.num_hashes)
            .map(|i| key.h0.wrapping_add(i.wrapping_mul(key.h1)))
            .collect::<Vec<_>>(),
        &[
            0x5615_800c,
            0x5b96_6560,
            0x6117_4ab4,
            0x6698_3008,
            0x6c19_155c,
            0x7199_fab0,
            0x771a_e004
        ]
    );
}
275+
276+
#[test]
277+
fn bloom_key_for_empty_path_matches_git_v2_vector() {
169278
let settings = BloomFilterSettings {
170279
hash_version: 2,
171280
num_hashes: 7,
@@ -187,4 +296,55 @@ mod tests {
187296
]
188297
);
189298
}
299+
300+
/// A path containing bytes with the high bit set must hash differently under version 1 and
/// version 2 settings.
#[test]
fn bloom_key_for_high_bit_path_differs_between_versions() {
    let path = BStr::new(b"\xc2\xa2");
    // Both keys use identical settings apart from the hash version.
    let key_for = |hash_version| {
        BloomKey::from_path(
            path,
            BloomFilterSettings {
                hash_version,
                num_hashes: 7,
                bits_per_entry: 10,
            },
        )
    };
    let v1 = key_for(1);
    let v2 = key_for(2);
    assert_ne!(v1, v2);
}
321+
322+
/// A two-byte filter built for a high-bit path has the expected bytes for each hash version.
#[test]
fn bloom_filter_for_high_bit_path_matches_git_v1_and_v2_vectors() {
    let path = b"\xc2\xa2";
    // Both filters use identical settings apart from the hash version.
    let settings_for = |hash_version| BloomFilterSettings {
        hash_version,
        num_hashes: 7,
        bits_per_entry: 10,
    };
    assert_eq!(filter_bytes_for_path(path, settings_for(1), 2), vec![0x52, 0xa9]);
    assert_eq!(filter_bytes_for_path(path, settings_for(2), 2), vec![0xc0, 0x1f]);
}
190350
}

gix-commitgraph/src/file/init.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,9 @@ impl File {
148148
bits_per_entry: from_be_u32(&data[data_range.start + 8..][..4]),
149149
};
150150
let bloom_data_payload_len = data_range.len() - BLOOM_FILTER_HEADER_SIZE;
151-
if bloom_index_offsets_are_valid(&data[index_range.clone()], bloom_data_payload_len) {
151+
if settings.is_supported()
152+
&& bloom_index_offsets_are_valid(&data[index_range.clone()], bloom_data_payload_len)
153+
{
152154
bloom_filter_settings = Some(settings);
153155
bloom_filter_data_len = bloom_data_payload_len;
154156
bloom_filter_data_offset = Some(data_range.start + BLOOM_FILTER_HEADER_SIZE);

gix-commitgraph/src/lib.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,17 @@ pub struct BloomFilterSettings {
5050
pub bits_per_entry: u32,
5151
}
5252

53+
impl BloomFilterSettings {
54+
pub(crate) fn is_supported(&self) -> bool {
55+
match self.hash_version {
56+
// Git's changed-path Bloom filter v1 hashes are deprecated, but we still need to
57+
// read them to match Git's historical commit-graph behavior.
58+
1 | 2 => true,
59+
_ => false,
60+
}
61+
}
62+
}
63+
5364
/// A complete commit graph.
5465
///
5566
/// The data in the commit graph may come from a monolithic `objects/info/commit-graph` file, or it

0 commit comments

Comments
 (0)