Skip to content

Commit 610be2a

Browse files
committed
builtin/log: prefetch necessary blobs for git cherry
In partial clones, `git cherry` fetches necessary blobs on-demand one at a time, which can be very slow. We would like to prefetch all necessary blobs upfront. To do so, we need to be able to first figure out which blobs are needed. `git cherry` does its work in a two-phase approach: first computing header-only IDs (based on file paths and modes), then falling back to full content-based IDs only when header-only IDs collide -- or, more accurately, whenever the oidhash() of the header-only object_ids collide. patch-ids.c handles this by creating an ids->patches hashmap that has all the data we need, but the problem is that any attempt to query the hashmap will invoke the patch_id_neq() function on any colliding objects, which causes the on-demand fetching. Insert a new prefetch_cherry_blobs() function before checking for collisions. Use a temporary replacement on the ids->patches.cmpfn in order to enumerate the blobs that would be needed without yet fetching them, and then fetch them all at once, then restore the old ids->patches.cmpfn. Signed-off-by: Elijah Newren <newren@gmail.com>
1 parent 7f5ac59 commit 610be2a

File tree

3 files changed

+353
-0
lines changed

3 files changed

+353
-0
lines changed

builtin/log.c

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@
2121
#include "color.h"
2222
#include "commit.h"
2323
#include "diff.h"
24+
#include "diffcore.h"
2425
#include "diff-merges.h"
2526
#include "revision.h"
2627
#include "log-tree.h"
2728
#include "oid-array.h"
29+
#include "oidset.h"
2830
#include "tag.h"
2931
#include "reflog-walk.h"
3032
#include "patch-ids.h"
@@ -43,9 +45,11 @@
4345
#include "utf8.h"
4446

4547
#include "commit-reach.h"
48+
#include "promisor-remote.h"
4649
#include "range-diff.h"
4750
#include "tmp-objdir.h"
4851
#include "tree.h"
52+
#include "userdiff.h"
4953
#include "write-or-die.h"
5054

5155
#define MAIL_DEFAULT_WRAP 72
@@ -2602,6 +2606,125 @@ static void print_commit(char sign, struct commit *commit, int verbose,
26022606
}
26032607
}
26042608

2609+
/*
2610+
* Enumerate blob OIDs from a single commit's diff, inserting them into blobs.
2611+
* Skips files whose userdiff driver explicitly declares binary status
2612+
* (drv->binary > 0), since patch-ID uses oid_to_hex() for those and
2613+
* never reads blob content. Use userdiff_find_by_path() since
2614+
* diff_filespec_load_driver() is static in diff.c.
2615+
*
2616+
* Clean up with diff_queue_clear() (from diffcore.h).
2617+
*/
2618+
static void collect_diff_blob_oids(struct commit *commit,
2619+
struct diff_options *opts,
2620+
struct oidset *blobs)
2621+
{
2622+
struct diff_queue_struct *q;
2623+
2624+
/*
2625+
* Merge commits are filtered out by patch_id_defined() in patch-ids.c,
2626+
* so we'll never be called with one.
2627+
*/
2628+
assert(!commit->parents || !commit->parents->next);
2629+
2630+
if (commit->parents)
2631+
diff_tree_oid(&commit->parents->item->object.oid,
2632+
&commit->object.oid, "", opts);
2633+
else
2634+
diff_root_tree_oid(&commit->object.oid, "", opts);
2635+
diffcore_std(opts);
2636+
2637+
q = &diff_queued_diff;
2638+
for (int i = 0; i < q->nr; i++) {
2639+
struct diff_filepair *p = q->queue[i];
2640+
struct userdiff_driver *drv;
2641+
2642+
/* Skip binary files */
2643+
drv = userdiff_find_by_path(opts->repo->index, p->one->path);
2644+
if (drv && drv->binary > 0)
2645+
continue;
2646+
2647+
if (DIFF_FILE_VALID(p->one))
2648+
oidset_insert(blobs, &p->one->oid);
2649+
if (DIFF_FILE_VALID(p->two))
2650+
oidset_insert(blobs, &p->two->oid);
2651+
}
2652+
diff_queue_clear(q);
2653+
}
2654+
2655+
static int always_match(const void *cmp_data UNUSED,
2656+
const struct hashmap_entry *entry1 UNUSED,
2657+
const struct hashmap_entry *entry2 UNUSED,
2658+
const void *keydata UNUSED)
2659+
{
2660+
return 0;
2661+
}
2662+
2663+
/*
2664+
* Prefetch blobs for git cherry in partial clones.
2665+
*
2666+
* Called between the revision walk (which builds the head-side
2667+
* commit list) and the has_commit_patch_id() comparison loop.
2668+
*
2669+
* Uses a cmpfn-swap trick to avoid reading blobs: temporarily
2670+
* replaces the hashmap's comparison function with a trivial
2671+
* always-match function, so hashmap_get()/hashmap_get_next() match
2672+
* any entry with the same oidhash bucket. These are the set of oids
2673+
* that would trigger patch_id_neq() during normal lookup and cause
2674+
* blobs to be read on demand, and we want to prefetch them all at
2675+
* once instead.
2676+
*/
2677+
static void prefetch_cherry_blobs(struct repository *repo,
2678+
struct commit_list *list,
2679+
struct patch_ids *ids)
2680+
{
2681+
struct oidset blobs = OIDSET_INIT;
2682+
hashmap_cmp_fn original_cmpfn;
2683+
2684+
/* Exit if we're not in a partial clone */
2685+
if (!repo_has_promisor_remote(repo))
2686+
return;
2687+
2688+
/* Save original cmpfn, replace with always_match */
2689+
original_cmpfn = ids->patches.cmpfn;
2690+
ids->patches.cmpfn = always_match;
2691+
2692+
/* Find header-only collisions, gather blobs from those commits */
2693+
for (struct commit_list *l = list; l; l = l->next) {
2694+
struct commit *c = l->item;
2695+
bool match_found = false;
2696+
for (struct patch_id *cur = patch_id_iter_first(c, ids);
2697+
cur;
2698+
cur = patch_id_iter_next(cur, ids)) {
2699+
match_found = true;
2700+
collect_diff_blob_oids(cur->commit, &ids->diffopts,
2701+
&blobs);
2702+
}
2703+
if (match_found)
2704+
collect_diff_blob_oids(c, &ids->diffopts, &blobs);
2705+
}
2706+
2707+
/* Restore original cmpfn */
2708+
ids->patches.cmpfn = original_cmpfn;
2709+
2710+
/* If we have any blobs to fetch, fetch them */
2711+
if (oidset_size(&blobs)) {
2712+
struct oid_array to_fetch = OID_ARRAY_INIT;
2713+
struct oidset_iter iter;
2714+
const struct object_id *oid;
2715+
2716+
oidset_iter_init(&blobs, &iter);
2717+
while ((oid = oidset_iter_next(&iter)))
2718+
oid_array_append(&to_fetch, oid);
2719+
2720+
promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);
2721+
2722+
oid_array_clear(&to_fetch);
2723+
}
2724+
2725+
oidset_clear(&blobs);
2726+
}
2727+
26052728
int cmd_cherry(int argc,
26062729
const char **argv,
26072730
const char *prefix,
@@ -2673,6 +2796,8 @@ int cmd_cherry(int argc,
26732796
commit_list_insert(commit, &list);
26742797
}
26752798

2799+
prefetch_cherry_blobs(the_repository, list, &ids);
2800+
26762801
for (struct commit_list *l = list; l; l = l->next) {
26772802
char sign = '+';
26782803

Lines changed: 210 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,210 @@
1+
# Design Spec: Batch Blob Prefetch for `git cherry` in Partial Clones
2+
3+
## Problem
4+
5+
In a partial clone with `--filter=blob:none`, `git cherry` compares
6+
commits using patch IDs. Patch IDs are computed in two phases:
7+
8+
1. Header-only: hashes file paths and mode changes only (no blob reads)
9+
2. Full: hashes actual diff content (requires reading blobs)
10+
11+
Phase 2 only runs when two commits have matching header-only IDs
12+
(i.e. they modify the same set of files with the same modes). This
13+
is common — any two commits touching the same file(s) will collide.
14+
15+
When phase 2 needs a blob that isn't local, it triggers an on-demand
16+
promisor fetch. Each fetch is a separate network round-trip. With
17+
many collisions, this means many sequential fetches.
18+
19+
## Solution Overview
20+
21+
Add a preparatory pass before the existing comparison loop in
22+
`cmd_cherry()` that:
23+
24+
1. Identifies which commit pairs will collide on header-only IDs
25+
2. Collects all blob OIDs those commits will need
26+
3. Batch-prefetches them in one fetch
27+
28+
After this pass, the existing comparison loop runs as before, but
29+
all needed blobs are already local, so no on-demand fetches occur.
30+
31+
## Detailed Design
32+
33+
### 1. No struct changes to patch_id
34+
35+
The existing `struct patch_id` and `patch_id_neq()` are not
36+
modified. `is_null_oid()` remains the sentinel for "full ID not
37+
yet computed". No `has_full_patch_id` boolean, no extra fields.
38+
39+
Key insight: `init_patch_id_entry()` stores only `oidhash()` (the
40+
first 4 bytes of the header-only ID) in the hashmap bucket key.
41+
The real `patch_id_neq()` comparison function is invoked only when
42+
`hashmap_get()` or `hashmap_get_next()` finds entries with a
43+
matching oidhash — and that comparison triggers blob reads.
44+
45+
The prefetch needs to detect exactly those oidhash collisions
46+
*without* triggering blob reads. We achieve this by temporarily
47+
swapping the hashmap's comparison function.
48+
49+
### 2. The prefetch function (in builtin/log.c)
50+
51+
This function takes the repository, the head-side commit list (as
52+
built by the existing revision walk in `cmd_cherry()`), and the
53+
patch_ids structure (which contains the upstream entries).
54+
55+
#### 2.1 Early exit
56+
57+
If the repository has no promisor remote, return immediately.
58+
Use `repo_has_promisor_remote()` from promisor-remote.h.
59+
60+
#### 2.2 Swap in a trivial comparison function
61+
62+
Save `ids->patches.cmpfn` (the real `patch_id_neq`) and replace
63+
it with a trivial function that always returns 0 ("equal").
64+
65+
```
66+
static int patch_id_match(const void *unused_cmpfn_data,
67+
const struct hashmap_entry *a,
68+
const struct hashmap_entry *b,
69+
const void *unused_keydata)
70+
{
71+
return 0;
72+
}
73+
```
74+
75+
With this cmpfn in place, `hashmap_get()` and `hashmap_get_next()`
76+
will match every entry in the same oidhash bucket — exactly the
77+
same set that would trigger `patch_id_neq()` during normal lookup.
78+
No blob reads occur because we never call the real comparison
79+
function.
80+
81+
#### 2.3 For each head-side commit, probe for collisions
82+
83+
For each commit in the head-side list:
84+
85+
- Use `patch_id_iter_first(commit, ids)` to probe the upstream
86+
hashmap. This handles `init_patch_id_entry()` + hashmap lookup
87+
internally. With our swapped cmpfn, it returns any upstream
88+
entry whose oidhash matches — i.e. any entry that *would*
89+
trigger `patch_id_neq()` during the real comparison loop.
90+
(Merge commits are already handled — `patch_id_iter_first()`
91+
returns NULL for them via `patch_id_defined()`.)
92+
- If there's a match: collect blob OIDs from the head-side commit
93+
(see section 3).
94+
- Then walk `patch_id_iter_next()` to find ALL upstream entries
95+
in the same bucket. For each, collect blob OIDs from that
96+
upstream commit too. (Multiple upstream commits can share the
97+
same oidhash bucket.)
98+
- Collect blob OIDs from the first upstream match too (from
99+
`patch_id_iter_first()`).
100+
101+
We need blobs from BOTH sides because `patch_id_neq()` computes
102+
full patch IDs for both the upstream and head-side commit when
103+
comparing.
104+
105+
#### 2.4 Restore the original comparison function
106+
107+
Set `ids->patches.cmpfn` back to the saved value (patch_id_neq).
108+
This MUST happen before returning — the subsequent
109+
`has_commit_patch_id()` loop needs the real comparison function.
110+
111+
#### 2.5 Batch prefetch
112+
113+
If the oidset is non-empty, populate an oid_array from it using
114+
`oidset_iter_first()`/`oidset_iter_next()`, then call
115+
`promisor_remote_get_direct(repo, oid_array.oid, oid_array.nr)`.
116+
117+
This is a single network round-trip regardless of how many blobs.
118+
119+
#### 2.6 Cleanup
120+
121+
Free the oid_array and the oidset.
122+
123+
### 3. Collecting blob OIDs from a commit (helper function)
124+
125+
Given a commit, enumerate the blobs its diff touches. Takes an
126+
oidset to insert into (provides automatic dedup — consecutive
127+
commits often share blob OIDs, e.g. B:foo == C^:foo when C's
128+
parent is B).
129+
130+
- Compute the diff: `diff_tree_oid()` for commits with a parent,
131+
`diff_root_tree_oid()` for root commits. Then `diffcore_std()`.
132+
- These populate the global `diff_queued_diff` queue.
133+
- For each filepair in the queue:
134+
- Check the userdiff driver for the file path. If the driver
135+
explicitly declares the file as binary (`drv->binary != -1`),
136+
skip it. Reason: patch-ID uses `oid_to_hex()` for binary
137+
files (see diff.c around line 6652) and never reads the blob.
138+
Use `userdiff_find_by_path()` (NOT `diff_filespec_load_driver`
139+
which is static in diff.c).
140+
- For both sides of the filepair (p->one and p->two): if the
141+
side is valid (`DIFF_FILE_VALID`) and has a non-null OID,
142+
check the dedup oidset — `oidset_insert()` handles dedup
143+
automatically (returns 1 if newly inserted, 0 if duplicate).
144+
- Clear the diff queue with `diff_queue_clear()` (from diffcore.h,
145+
not diff.h).
146+
147+
Note on `drv->binary`: The value -1 means "not set" (auto-detect
148+
at read time by reading the blob); 0 means explicitly text (will
149+
be diffed, blob reads needed); positive means explicitly binary
150+
(patch-ID uses `oid_to_hex()`, no blob read needed).
151+
152+
The correct skip condition is `drv && drv->binary > 0` — skip
153+
only known-binary files. Do NOT use `drv->binary != -1`, which
154+
would also skip explicitly-text files that DO need blob reads.
155+
(The copilot reference implementation uses `!= -1`, which is
156+
technically wrong but harmless in practice since explicit text
157+
attributes are rare.)
158+
159+
### 4. Call site in cmd_cherry()
160+
161+
Insert the call between the revision walk loop (which builds the
162+
head-side commit list) and the comparison loop (which calls
163+
`has_commit_patch_id()`).
164+
165+
### 5. Required includes in builtin/log.c
166+
167+
- promisor-remote.h (for repo_has_promisor_remote,
168+
promisor_remote_get_direct)
169+
- userdiff.h (for userdiff_find_by_path)
170+
- oidset.h (for oidset used in blob OID dedup)
171+
- diffcore.h (for diff_queue_clear)
172+
173+
## Edge Cases
174+
175+
- No promisor remote: early return, zero overhead
176+
- No collisions: probes the hashmap for each head-side commit but
177+
finds no bucket matches, no blobs collected, no fetch issued
178+
- Merge commits in head-side list: skipped (no patch ID defined)
179+
- Root commits (no parent): use diff_root_tree_oid instead of
180+
diff_tree_oid
181+
- Binary files (explicit driver): skipped, patch-ID doesn't read
182+
them
183+
- The cmpfn swap approach matches at oidhash granularity (4 bytes),
184+
which is exactly what the hashmap itself uses to trigger
185+
patch_id_neq(). This means we prefetch for every case the real
186+
code would trigger, plus rare false-positive oidhash collisions
187+
(harmless: we fetch a few extra blobs that won't end up being
188+
compared). No under-fetching is possible.
189+
190+
## Testing
191+
192+
See t/t3500-cherry.sh on the copilot-faster-partial-clones branch
193+
for two tests:
194+
195+
Test 5: "cherry batch-prefetches blobs in partial clone"
196+
- Creates server with 3 upstream + 3 head-side commits modifying
197+
the same file (guarantees collisions)
198+
- Clones with --filter=blob:none
199+
- Runs `git cherry` with GIT_TRACE2_PERF
200+
- Asserts exactly 1 fetch (batch) instead of 6 (individual)
201+
202+
Test 6: "cherry prefetch omits blobs for cherry-picked commits"
203+
- Creates a cherry-pick scenario (divergent branches, shared
204+
commit cherry-picked to head side)
205+
- Verifies `git cherry` correctly identifies the cherry-picked
206+
commit as "-" and head-only commits as "+"
207+
- Important: the head side must diverge before the cherry-pick
208+
so the cherry-pick creates a distinct commit object (otherwise
209+
the commit hash is identical and it's in the symmetric
210+
difference, not needing patch-ID comparison at all)

t/t3500-cherry.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,22 @@ test_expect_success 'cherry ignores whitespace' '
7878
test_cmp expect actual
7979
'
8080

81+
# Reuse the expect file from the previous test, in a partial clone
82+
test_expect_success 'cherry in partial clone does bulk prefetch' '
83+
test_config uploadpack.allowfilter 1 &&
84+
test_config uploadpack.allowanysha1inwant 1 &&
85+
test_when_finished "rm -rf copy" &&
86+
87+
git clone --bare --filter=blob:none file://"$(pwd)" copy &&
88+
(
89+
cd copy &&
90+
GIT_TRACE2_EVENT="$(pwd)/trace.output" git cherry upstream-with-space feature-without-space >actual &&
91+
test_cmp ../expect actual &&
92+
93+
grep "child_start.*fetch.negotiationAlgorithm" trace.output >fetches &&
94+
test_line_count = 1 fetches &&
95+
test_trace2_data promisor fetch_count 4 <trace.output
96+
)
97+
'
98+
8199
test_done

0 commit comments

Comments
 (0)