On Wed, Dec 21, 2022 at 05:04:45AM +0100, Christian Couder wrote: > From: Christian Couder > > After cloning with --filter=, for example to avoid > getting unneeded large files on a user machine, it's possible > that some of these large files still get fetched for some reasons > (like checking out old branches) over time. > > In this case the repo size could grow too much for no good reason > and `git repack --filter=` would be useful to remove > the unneeded large files. > > This command could be dangerous to use though, as it might remove > local objects that haven't been pushed which would lose data and > corrupt the repo. On a server, this command could also corrupt a > repo unless ALL the removed objects aren't already available in > another remote that clients can access. > > To mitigate that risk, we check that a promisor remote has at > least been configured. While this is a nice safeguard, I wonder whether it is sufficient. Suppose you for example have a non-bare repository that already has blobs checked out that would become removed by the filtering repack -- does Git handle this situation gracefully? A quick check seems to indicate that it does. But not quite as well as I'd have hoped: when I switch to a detached HEAD with an arbitrary commit and then execute `git repack --filter=blob:none` then it also removes blobs that are referenced by the currently checked-out commit. This may or may not be what the user is asking for, but I'd rather lean towards this behaviour being surprising. Patrick > Signed-off-by: John Cai > Signed-off-by: Christian Couder > --- > Documentation/git-repack.txt | 8 ++++++++ > builtin/repack.c | 28 +++++++++++++++++++++------- > t/t7700-repack.sh | 15 +++++++++++++++ > 3 files changed, 44 insertions(+), 7 deletions(-) > > diff --git a/Documentation/git-repack.txt b/Documentation/git-repack.txt > index 4017157949..2539ee0a02 100644 > --- a/Documentation/git-repack.txt > +++ b/Documentation/git-repack.txt > @@ -143,6 +143,14 @@ depth is 4095. > a larger and slower repository; see the discussion in > `pack.packSizeLimit`. > > +--filter=:: > + Omits certain objects (usually blobs) from the resulting > + packfile. WARNING: this could easily corrupt the current repo > + and lose data if ANY of the omitted objects hasn't been already > + pushed to a remote. Be very careful about objects that might > + have been created locally! See linkgit:git-rev-list[1] for valid > + `` forms. > + > -b:: > --write-bitmap-index:: > Write a reachability bitmap index as part of the repack. This > diff --git a/builtin/repack.c b/builtin/repack.c > index c1402ad038..8e5ac9c171 100644 > --- a/builtin/repack.c > +++ b/builtin/repack.c > @@ -49,6 +49,7 @@ struct pack_objects_args { > const char *depth; > const char *threads; > const char *max_pack_size; > + const char *filter; > int no_reuse_delta; > int no_reuse_object; > int quiet; > @@ -163,6 +164,8 @@ static void prepare_pack_objects(struct child_process *cmd, > strvec_pushf(&cmd->args, "--threads=%s", args->threads); > if (args->max_pack_size) > strvec_pushf(&cmd->args, "--max-pack-size=%s", args->max_pack_size); > + if (args->filter) > + strvec_pushf(&cmd->args, "--filter=%s", args->filter); > if (args->no_reuse_delta) > strvec_pushf(&cmd->args, "--no-reuse-delta"); > if (args->no_reuse_object) > @@ -234,6 +237,13 @@ static struct generated_pack_data *populate_pack_exts(const char *name) > return data; > } > > +static void write_promisor_file_1(char *p) > +{ > + char *promisor_name = mkpathdup("%s-%s.promisor", packtmp, p); > + write_promisor_file(promisor_name, NULL, 0); > + free(promisor_name); > +} > + > static void repack_promisor_objects(const struct pack_objects_args *args, > struct string_list *names) > { > @@ -265,7 +275,6 @@ static void repack_promisor_objects(const struct pack_objects_args *args, > out = xfdopen(cmd.out, "r"); > while (strbuf_getline_lf(&line, out) != EOF) { > struct string_list_item *item; > - char *promisor_name; > > if (line.len != the_hash_algo->hexsz) > die(_("repack: Expecting full hex object ID lines only from pack-objects.")); > @@ -282,13 +291,8 @@ static void repack_promisor_objects(const struct pack_objects_args *args, > * concatenate the contents of all .promisor files instead of > * just creating a new empty file. > */ > - promisor_name = mkpathdup("%s-%s.promisor", packtmp, > - line.buf); > - write_promisor_file(promisor_name, NULL, 0); > - > + write_promisor_file_1(line.buf); > item->util = populate_pack_exts(item->string); > - > - free(promisor_name); > } > fclose(out); > if (finish_command(&cmd)) > @@ -800,6 +804,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix) > N_("limits the maximum number of threads")), > OPT_STRING(0, "max-pack-size", &po_args.max_pack_size, N_("bytes"), > N_("maximum size of each packfile")), > + OPT_STRING(0, "filter", &po_args.filter, N_("args"), > + N_("object filtering")), > OPT_BOOL(0, "pack-kept-objects", &pack_kept_objects, > N_("repack objects in packs marked with .keep")), > OPT_STRING_LIST(0, "keep-pack", &keep_pack_list, N_("name"), > @@ -834,6 +840,12 @@ int cmd_repack(int argc, const char **argv, const char *prefix) > die(_("options '%s' and '%s' cannot be used together"), "--cruft", "-k"); > } > > + if (po_args.filter && !has_promisor_remote()) > + die("a promisor remote must be setup\n" > + "Also please push all the objects " > + "that might be filtered to that remote!\n" > + "Otherwise they will be lost!"); > + > if (write_bitmaps < 0) { > if (!write_midx && > (!(pack_everything & ALL_INTO_ONE) || !is_bare_repository())) > @@ -971,6 +983,8 @@ int cmd_repack(int argc, const char **argv, const char *prefix) > if (line.len != the_hash_algo->hexsz) > die(_("repack: Expecting full hex object ID lines only from pack-objects.")); > item = string_list_append(&names, line.buf); > + if (po_args.filter) > + write_promisor_file_1(line.buf); > item->util = populate_pack_exts(item->string); > } > strbuf_release(&line); > diff --git a/t/t7700-repack.sh b/t/t7700-repack.sh > index 4aabe98139..3a6ad9f623 100755 > --- a/t/t7700-repack.sh > +++ b/t/t7700-repack.sh > @@ -253,6 +253,21 @@ test_expect_success 'auto-bitmaps do not complain if unavailable' ' > test_must_be_empty actual > ' > > +test_expect_success 'repacking with a filter works' ' > + test_when_finished "rm -rf server client" && > + test_create_repo server && > + git -C server config uploadpack.allowFilter true && > + git -C server config uploadpack.allowAnySHA1InWant true && > + test_commit -C server 1 && > + git clone --bare --no-local server client && > + git -C client config remote.origin.promisor true && > + git -C client rev-list --objects --all --missing=print >objects && > + test $(grep -c "^?" objects) = 0 && > + git -C client -c repack.writebitmaps=false repack -a -d --filter=blob:none && > + git -C client rev-list --objects --all --missing=print >objects && > + test $(grep -c "^?" objects) = 1 > +' > + > objdir=.git/objects > midx=$objdir/pack/multi-pack-index > > -- > 2.39.0.59.g395bcb85bc.dirty >