Skip to content

Commit a705852

Browse files
committed
builtin/log: prefetch necessary blobs for git cherry
In partial clones, `git cherry` fetches necessary blobs on-demand one at a time, which can be very slow. We would like to prefetch all necessary blobs upfront. To do so, we need to be able to first figure out which blobs are needed. `git cherry` does its work in a two-phase approach: first computing header-only IDs (based on file paths and modes), then falling back to full content-based IDs only when header-only IDs collide -- or, more accurately, whenever the oidhash() of the header-only object_ids collide. patch-ids.c handles this by creating an ids->patches hashmap that has all the data we need, but the problem is that any attempt to query the hashmap will invoke the patch_id_neq() function on any colliding objects, which causes the on-demand fetching. Insert a new prefetch_cherry_blobs() function before checking for collisions. Use a temporary replacement on the ids->patches.cmpfn in order to enumerate the blobs that would be needed without yet fetching them, and then fetch them all at once, then restore the old ids->patches.cmpfn. Signed-off-by: Elijah Newren <newren@gmail.com>
1 parent 663816a commit a705852

File tree

2 files changed

+143
-0
lines changed

2 files changed

+143
-0
lines changed

builtin/log.c

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,12 @@
2121
#include "color.h"
2222
#include "commit.h"
2323
#include "diff.h"
24+
#include "diffcore.h"
2425
#include "diff-merges.h"
2526
#include "revision.h"
2627
#include "log-tree.h"
2728
#include "oid-array.h"
29+
#include "oidset.h"
2830
#include "tag.h"
2931
#include "reflog-walk.h"
3032
#include "patch-ids.h"
@@ -43,9 +45,11 @@
4345
#include "utf8.h"
4446

4547
#include "commit-reach.h"
48+
#include "promisor-remote.h"
4649
#include "range-diff.h"
4750
#include "tmp-objdir.h"
4851
#include "tree.h"
52+
#include "userdiff.h"
4953
#include "write-or-die.h"
5054

5155
#define MAIL_DEFAULT_WRAP 72
@@ -2602,6 +2606,125 @@ static void print_commit(char sign, struct commit *commit, int verbose,
26022606
}
26032607
}
26042608

2609+
/*
2610+
* Enumerate blob OIDs from a single commit's diff, inserting them into blobs.
2611+
* Skips files whose userdiff driver explicitly declares binary status
2612+
* (drv->binary > 0), since patch-ID uses oid_to_hex() for those and
2613+
* never reads blob content. Use userdiff_find_by_path() since
2614+
* diff_filespec_load_driver() is static in diff.c.
2615+
*
2616+
* Clean up with diff_queue_clear() (from diffcore.h).
2617+
*/
2618+
static void collect_diff_blob_oids(struct commit *commit,
2619+
struct diff_options *opts,
2620+
struct oidset *blobs)
2621+
{
2622+
struct diff_queue_struct *q;
2623+
2624+
/*
2625+
* Merge commits are filtered out by patch_id_defined() in patch-ids.c,
2626+
* so we'll never be called with one.
2627+
*/
2628+
assert(!commit->parents || !commit->parents->next);
2629+
2630+
if (commit->parents)
2631+
diff_tree_oid(&commit->parents->item->object.oid,
2632+
&commit->object.oid, "", opts);
2633+
else
2634+
diff_root_tree_oid(&commit->object.oid, "", opts);
2635+
diffcore_std(opts);
2636+
2637+
q = &diff_queued_diff;
2638+
for (int i = 0; i < q->nr; i++) {
2639+
struct diff_filepair *p = q->queue[i];
2640+
struct userdiff_driver *drv;
2641+
2642+
/* Skip binary files */
2643+
drv = userdiff_find_by_path(opts->repo->index, p->one->path);
2644+
if (drv && drv->binary > 0)
2645+
continue;
2646+
2647+
if (DIFF_FILE_VALID(p->one))
2648+
oidset_insert(blobs, &p->one->oid);
2649+
if (DIFF_FILE_VALID(p->two))
2650+
oidset_insert(blobs, &p->two->oid);
2651+
}
2652+
diff_queue_clear(q);
2653+
}
2654+
2655+
/*
 * Hashmap comparison callback that ignores all of its arguments and
 * returns 0 for every pair of entries. prefetch_cherry_blobs() installs
 * it temporarily in place of the real comparison function.
 */
static int always_match(const void *cmp_data UNUSED,
			const struct hashmap_entry *entry1 UNUSED,
			const struct hashmap_entry *entry2 UNUSED,
			const void *keydata UNUSED)
{
	return 0;
}
2662+
2663+
/*
2664+
* Prefetch blobs for git cherry in partial clones.
2665+
*
2666+
* Called between the revision walk (which builds the head-side
2667+
* commit list) and the has_commit_patch_id() comparison loop.
2668+
*
2669+
* Uses a cmpfn-swap trick to avoid reading blobs: temporarily
2670+
* replaces the hashmap's comparison function with a trivial
2671+
* always-match function, so hashmap_get()/hashmap_get_next() match
2672+
* any entry with the same oidhash bucket. These are the set of oids
2673+
* that would trigger patch_id_neq() during normal lookup and cause
2674+
* blobs to be read on demand, and we want to prefetch them all at
2675+
* once instead.
2676+
*/
2677+
static void prefetch_cherry_blobs(struct repository *repo,
2678+
struct commit_list *list,
2679+
struct patch_ids *ids)
2680+
{
2681+
struct oidset blobs = OIDSET_INIT;
2682+
hashmap_cmp_fn original_cmpfn;
2683+
2684+
/* Exit if we're not in a partial clone */
2685+
if (!repo_has_promisor_remote(repo))
2686+
return;
2687+
2688+
/* Save original cmpfn, replace with always_match */
2689+
original_cmpfn = ids->patches.cmpfn;
2690+
ids->patches.cmpfn = always_match;
2691+
2692+
/* Find header-only collisions, gather blobs from those commits */
2693+
for (struct commit_list *l = list; l; l = l->next) {
2694+
struct commit *c = l->item;
2695+
bool match_found = false;
2696+
for (struct patch_id *cur = patch_id_iter_first(c, ids);
2697+
cur;
2698+
cur = patch_id_iter_next(cur, ids)) {
2699+
match_found = true;
2700+
collect_diff_blob_oids(cur->commit, &ids->diffopts,
2701+
&blobs);
2702+
}
2703+
if (match_found)
2704+
collect_diff_blob_oids(c, &ids->diffopts, &blobs);
2705+
}
2706+
2707+
/* Restore original cmpfn */
2708+
ids->patches.cmpfn = original_cmpfn;
2709+
2710+
/* If we have any blobs to fetch, fetch them */
2711+
if (oidset_size(&blobs)) {
2712+
struct oid_array to_fetch = OID_ARRAY_INIT;
2713+
struct oidset_iter iter;
2714+
const struct object_id *oid;
2715+
2716+
oidset_iter_init(&blobs, &iter);
2717+
while ((oid = oidset_iter_next(&iter)))
2718+
oid_array_append(&to_fetch, oid);
2719+
2720+
promisor_remote_get_direct(repo, to_fetch.oid, to_fetch.nr);
2721+
2722+
oid_array_clear(&to_fetch);
2723+
}
2724+
2725+
oidset_clear(&blobs);
2726+
}
2727+
26052728
int cmd_cherry(int argc,
26062729
const char **argv,
26072730
const char *prefix,
@@ -2673,6 +2796,8 @@ int cmd_cherry(int argc,
26732796
commit_list_insert(commit, &list);
26742797
}
26752798

2799+
prefetch_cherry_blobs(the_repository, list, &ids);
2800+
26762801
for (struct commit_list *l = list; l; l = l->next) {
26772802
char sign = '+';
26782803

t/t3500-cherry.sh

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,4 +78,22 @@ test_expect_success 'cherry ignores whitespace' '
7878
test_cmp expect actual
7979
'
8080

81+
# Reuse the expect file from the previous test, in a partial clone.
# The clone is made with --filter=blob:none, so cherry has to obtain blobs
# from the promisor remote. The trace2 event output must contain exactly
# one child_start line mentioning fetch.negotiationAlgorithm, and a
# promisor fetch_count of 4.
82+
test_expect_success 'cherry in partial clone does bulk prefetch' '
83+
test_config uploadpack.allowfilter 1 &&
84+
test_config uploadpack.allowanysha1inwant 1 &&
85+
test_when_finished "rm -rf copy" &&
86+
87+
git clone --bare --filter=blob:none file://"$(pwd)" copy &&
88+
(
89+
cd copy &&
90+
GIT_TRACE2_EVENT="$(pwd)/trace.output" git cherry upstream-with-space feature-without-space >actual &&
91+
test_cmp ../expect actual &&
92+
93+
grep "child_start.*fetch.negotiationAlgorithm" trace.output >fetches &&
94+
test_line_count = 1 fetches &&
95+
test_trace2_data promisor fetch_count 4 <trace.output
96+
)
97+
'
98+
8199
test_done

0 commit comments

Comments
 (0)