Skip to content

Commit 6dbfc76

Browse files
committed
grep: prefetch necessary blobs
In partial clones, `git grep` fetches the blobs it needs on demand, one at a time, which can be very slow. To avoid this, when running in a partial clone, add an extra preliminary walk over the tree, similar to grep_tree(), which collects the blobs of interest and then prefetches them in a single batch. Signed-off-by: Elijah Newren <newren@gmail.com>
1 parent 610be2a commit 6dbfc76

File tree

2 files changed

+177
-0
lines changed

2 files changed

+177
-0
lines changed

builtin/grep.c

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,9 +28,12 @@
2828
#include "object-file.h"
2929
#include "object-name.h"
3030
#include "odb.h"
31+
#include "oid-array.h"
32+
#include "oidset.h"
3133
#include "packfile.h"
3234
#include "pager.h"
3335
#include "path.h"
36+
#include "promisor-remote.h"
3437
#include "read-cache-ll.h"
3538
#include "write-or-die.h"
3639

@@ -692,6 +695,143 @@ static int grep_tree(struct grep_opt *opt, const struct pathspec *pathspec,
692695
return hit;
693696
}
694697

698+
static void collect_blob_oids_for_tree(struct repository *repo,
699+
const struct pathspec *pathspec,
700+
struct tree_desc *tree,
701+
struct strbuf *base,
702+
int tn_len,
703+
struct oidset *blob_oids)
704+
{
705+
struct name_entry entry;
706+
int old_baselen = base->len;
707+
struct strbuf name = STRBUF_INIT;
708+
enum interesting match = entry_not_interesting;
709+
710+
while (tree_entry(tree, &entry)) {
711+
if (match != all_entries_interesting) {
712+
strbuf_addstr(&name, base->buf + tn_len);
713+
match = tree_entry_interesting(repo->index,
714+
&entry, &name,
715+
pathspec);
716+
strbuf_reset(&name);
717+
718+
if (match == all_entries_not_interesting)
719+
break;
720+
if (match == entry_not_interesting)
721+
continue;
722+
}
723+
724+
strbuf_add(base, entry.path, tree_entry_len(&entry));
725+
726+
if (S_ISREG(entry.mode)) {
727+
oidset_insert(blob_oids, &entry.oid);
728+
} else if (S_ISDIR(entry.mode)) {
729+
enum object_type type;
730+
struct tree_desc sub_tree;
731+
void *data;
732+
unsigned long size;
733+
734+
data = odb_read_object(repo->objects, &entry.oid,
735+
&type, &size);
736+
if (!data)
737+
die(_("unable to read tree (%s)"),
738+
oid_to_hex(&entry.oid));
739+
740+
strbuf_addch(base, '/');
741+
init_tree_desc(&sub_tree, &entry.oid, data, size);
742+
collect_blob_oids_for_tree(repo, pathspec, &sub_tree,
743+
base, tn_len, blob_oids);
744+
free(data);
745+
}
746+
/*
747+
* ...no else clause for S_ISGITLINK: submodules have their
748+
* own promisor configuration and would need separate fetches
749+
* anyway.
750+
*/
751+
752+
strbuf_setlen(base, old_baselen);
753+
}
754+
755+
strbuf_release(&name);
756+
}
757+
758+
static void collect_blob_oids_for_treeish(struct grep_opt *opt,
759+
const struct pathspec *pathspec,
760+
const struct object_id *tree_ish_oid,
761+
const char *name,
762+
struct oidset *blob_oids)
763+
{
764+
struct tree_desc tree;
765+
void *data;
766+
unsigned long size;
767+
struct strbuf base = STRBUF_INIT;
768+
int len;
769+
770+
data = odb_read_object_peeled(opt->repo->objects, tree_ish_oid,
771+
OBJ_TREE, &size, NULL);
772+
773+
if (!data)
774+
return;
775+
776+
len = name ? strlen(name) : 0;
777+
if (len) {
778+
strbuf_add(&base, name, len);
779+
strbuf_addch(&base, ':');
780+
}
781+
init_tree_desc(&tree, tree_ish_oid, data, size);
782+
783+
collect_blob_oids_for_tree(opt->repo, pathspec, &tree,
784+
&base, base.len, blob_oids);
785+
786+
strbuf_release(&base);
787+
free(data);
788+
}
789+
790+
static void prefetch_grep_blobs(struct grep_opt *opt,
791+
const struct pathspec *pathspec,
792+
const struct object_array *list)
793+
{
794+
struct oidset blob_oids = OIDSET_INIT;
795+
796+
/* Exit if we're not in a partial clone */
797+
if (!repo_has_promisor_remote(opt->repo))
798+
return;
799+
800+
/* For each tree, gather the blobs in it */
801+
for (int i = 0; i < list->nr; i++) {
802+
struct object *real_obj;
803+
804+
obj_read_lock();
805+
real_obj = deref_tag(opt->repo, list->objects[i].item,
806+
NULL, 0);
807+
obj_read_unlock();
808+
809+
if (real_obj &&
810+
(real_obj->type == OBJ_COMMIT ||
811+
real_obj->type == OBJ_TREE))
812+
collect_blob_oids_for_treeish(opt, pathspec,
813+
&real_obj->oid,
814+
list->objects[i].name,
815+
&blob_oids);
816+
}
817+
818+
/* Prefetch the blobs we found */
819+
if (oidset_size(&blob_oids)) {
820+
struct oid_array to_fetch = OID_ARRAY_INIT;
821+
struct oidset_iter iter;
822+
const struct object_id *oid;
823+
824+
oidset_iter_init(&blob_oids, &iter);
825+
while ((oid = oidset_iter_next(&iter)))
826+
oid_array_append(&to_fetch, oid);
827+
828+
promisor_remote_get_direct(opt->repo, to_fetch.oid, to_fetch.nr);
829+
830+
oid_array_clear(&to_fetch);
831+
}
832+
oidset_clear(&blob_oids);
833+
}
834+
695835
static int grep_object(struct grep_opt *opt, const struct pathspec *pathspec,
696836
struct object *obj, const char *name, const char *path)
697837
{
@@ -732,6 +872,8 @@ static int grep_objects(struct grep_opt *opt, const struct pathspec *pathspec,
732872
int hit = 0;
733873
const unsigned int nr = list->nr;
734874

875+
prefetch_grep_blobs(opt, pathspec, list);
876+
735877
for (i = 0; i < nr; i++) {
736878
struct object *real_obj;
737879

t/t7810-grep.sh

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1929,4 +1929,39 @@ test_expect_success 'grep does not report i-t-a and assume unchanged with -L' '
19291929
test_cmp expected actual
19301930
'
19311931

1932+
test_expect_success 'grep of revision in partial clone does bulk prefetch' '
	test_when_finished "rm -rf grep-partial-src grep-partial" &&
	# Do not leave this test'\''s scratch files behind; in particular the
	# trace2 event file is appended to, not truncated.
	test_when_finished "rm -f missing result grep-trace" &&

	git init grep-partial-src &&
	(
		cd grep-partial-src &&
		git config uploadpack.allowfilter 1 &&
		git config uploadpack.allowanysha1inwant 1 &&
		echo "needle in haystack" >searchme &&
		echo "no match here" >other &&
		mkdir subdir &&
		echo "needle again" >subdir/deep &&
		git add . &&
		git commit -m "initial"
	) &&

	git clone --no-checkout --filter=blob:none \
		"file://$(pwd)/grep-partial-src" grep-partial &&

	# All blobs should be missing after a blobless clone.
	git -C grep-partial rev-list --quiet --objects \
		--missing=print HEAD >missing &&
	test_line_count = 3 missing &&

	# grep HEAD should batch-prefetch all blobs in one request.
	GIT_TRACE2_EVENT="$(pwd)/grep-trace" \
		git -C grep-partial grep -c "needle" HEAD >result &&

	# Should find matches in two files.
	test_line_count = 2 result &&

	# Should have prefetched all 3 objects at once
	test_trace2_data promisor fetch_count 3 <grep-trace
'
1966+
19321967
test_done

0 commit comments

Comments
 (0)