Git Mailing List Archive mirror
 help / color / mirror / Atom feed
From: "Derrick Stolee via GitGitGadget" <gitgitgadget@gmail.com>
To: git@vger.kernel.org
Cc: christian.couder@gmail.com, gitster@pobox.com,
	johannes.schindelin@gmx.de, johncai86@gmail.com,
	jonathantanmy@google.com, karthik.188@gmail.com,
	kristofferhaugsbakk@fastmail.com, me@ttaylorr.com,
	newren@gmail.com, peff@peff.net, ps@pks.im,
	Derrick Stolee <stolee@gmail.com>,
	Derrick Stolee <stolee@gmail.com>
Subject: [PATCH 10/13] pack-objects: refactor path-walk delta phase
Date: Mon, 10 Mar 2025 01:50:52 +0000	[thread overview]
Message-ID: <c047fbd7f275ef79695b5e1356075705e6fd7bc8.1741571455.git.gitgitgadget@gmail.com> (raw)
In-Reply-To: <pull.1819.git.1741571455.gitgitgadget@gmail.com>

From: Derrick Stolee <stolee@gmail.com>

Previously, the --path-walk option to 'git pack-objects' would compute
deltas inline with the path-walk logic. This would make the progress
indicator look like it is taking a long time to enumerate objects, and
then very quickly computed deltas.

Instead of computing deltas on each region of objects organized by tree,
store a list of regions corresponding to these groups. These can later
be pulled from the list for delta compression before doing the "global"
delta search.

This presents a new progress indicator that can be used in tests to
verify that this stage is happening.

The current implementation is not integrated with threads, but could be
done in a future update.

Since we do not attempt to sort objects by size until after exploring
all trees, we can remove the previous change to t5530 due to a different
error message appearing first.

Signed-off-by: Derrick Stolee <stolee@gmail.com>
---
 builtin/pack-objects.c       | 82 +++++++++++++++++++++++++-----------
 pack-objects.h               | 12 ++++++
 t/t5300-pack-object.sh       |  8 +++-
 t/t5530-upload-pack-error.sh |  6 ---
 4 files changed, 75 insertions(+), 33 deletions(-)

diff --git a/builtin/pack-objects.c b/builtin/pack-objects.c
index c756ce50dd7..c5a3129c88e 100644
--- a/builtin/pack-objects.c
+++ b/builtin/pack-objects.c
@@ -3233,6 +3233,51 @@ static int should_attempt_deltas(struct object_entry *entry)
 	return 1;
 }
 
+static void find_deltas_for_region(struct object_entry *list UNUSED,
+				   struct packing_region *region,
+				   unsigned int *processed)
+{
+	struct object_entry **delta_list;
+	uint32_t delta_list_nr = 0;
+
+	ALLOC_ARRAY(delta_list, region->nr);
+	for (uint32_t i = 0; i < region->nr; i++) {
+		struct object_entry *entry = to_pack.objects + region->start + i;
+		if (should_attempt_deltas(entry))
+			delta_list[delta_list_nr++] = entry;
+	}
+
+	QSORT(delta_list, delta_list_nr, type_size_sort);
+	find_deltas(delta_list, &delta_list_nr, window, depth, processed);
+	free(delta_list);
+}
+
+static void find_deltas_by_region(struct object_entry *list,
+				  struct packing_region *regions,
+				  uint32_t start, uint32_t nr)
+{
+	unsigned int processed = 0;
+	uint32_t progress_nr;
+
+	if (!nr)
+		return;
+
+	progress_nr = regions[nr - 1].start + regions[nr - 1].nr;
+
+	if (progress)
+		progress_state = start_progress(the_repository,
+						_("Compressing objects by path"),
+						progress_nr);
+
+	while (nr--)
+		find_deltas_for_region(list,
+				       &regions[start++],
+				       &processed);
+
+	display_progress(progress_state, progress_nr);
+	stop_progress(&progress_state);
+}
+
 static void prepare_pack(int window, int depth)
 {
 	struct object_entry **delta_list;
@@ -3257,6 +3302,10 @@ static void prepare_pack(int window, int depth)
 	if (!to_pack.nr_objects || !window || !depth)
 		return;
 
+	if (path_walk)
+		find_deltas_by_region(to_pack.objects, to_pack.regions,
+				      0, to_pack.nr_regions);
+
 	ALLOC_ARRAY(delta_list, to_pack.nr_objects);
 	nr_deltas = n = 0;
 
@@ -4210,10 +4259,8 @@ static int add_objects_by_path(const char *path,
 			       enum object_type type,
 			       void *data)
 {
-	struct object_entry **delta_list;
 	size_t oe_start = to_pack.nr_objects;
 	size_t oe_end;
-	unsigned int sub_list_size;
 	unsigned int *processed = data;
 
 	/*
@@ -4246,32 +4293,17 @@ static int add_objects_by_path(const char *path,
 	if (oe_end == oe_start || !window)
 		return 0;
 
-	sub_list_size = 0;
-	ALLOC_ARRAY(delta_list, oe_end - oe_start);
-
-	for (size_t i = 0; i < oe_end - oe_start; i++) {
-		struct object_entry *entry = to_pack.objects + oe_start + i;
+	ALLOC_GROW(to_pack.regions,
+		   to_pack.nr_regions + 1,
+		   to_pack.nr_regions_alloc);
 
-		if (!should_attempt_deltas(entry))
-			continue;
-
-		delta_list[sub_list_size++] = entry;
-	}
+	to_pack.regions[to_pack.nr_regions].start = oe_start;
+	to_pack.regions[to_pack.nr_regions].nr = oe_end - oe_start;
+	to_pack.nr_regions++;
 
-	/*
-	 * Find delta bases among this list of objects that all match the same
-	 * path. This causes the delta compression to be interleaved in the
-	 * object walk, which can lead to confusing progress indicators. This is
-	 * also incompatible with threaded delta calculations. In the future,
-	 * consider creating a list of regions in the full to_pack.objects array
-	 * that could be picked up by the threaded delta computation.
-	 */
-	if (sub_list_size && window) {
-		QSORT(delta_list, sub_list_size, type_size_sort);
-		find_deltas(delta_list, &sub_list_size, window, depth, processed);
-	}
+	*processed += oids->nr;
+	display_progress(progress_state, *processed);
 
-	free(delta_list);
 	return 0;
 }
 
diff --git a/pack-objects.h b/pack-objects.h
index d73e3843c92..1b01304f9c4 100644
--- a/pack-objects.h
+++ b/pack-objects.h
@@ -119,11 +119,23 @@ struct object_entry {
 	unsigned ext_base:1; /* delta_idx points outside packlist */
 };
 
+/**
+ * A packing region is a section of the packing_data.objects array
+ * as given by a starting index and a number of elements.
+ */
+struct packing_region {
+	uint32_t start;
+	uint32_t nr;
+};
+
 struct packing_data {
 	struct repository *repo;
 	struct object_entry *objects;
 	uint32_t nr_objects, nr_alloc;
 
+	struct packing_region *regions;
+	uint32_t nr_regions, nr_regions_alloc;
+
 	int32_t *index;
 	uint32_t index_size;
 
diff --git a/t/t5300-pack-object.sh b/t/t5300-pack-object.sh
index 16420d12863..c8df6afd784 100755
--- a/t/t5300-pack-object.sh
+++ b/t/t5300-pack-object.sh
@@ -725,7 +725,9 @@ test_expect_success '--name-hash-version=2 and --write-bitmap-index are incompat
 
 test_expect_success '--path-walk pack everything' '
 	git -C server rev-parse HEAD >in &&
-	git -C server pack-objects --stdout --revs --path-walk <in >out.pack &&
+	GIT_PROGRESS_DELAY=0 git -C server pack-objects \
+		--stdout --revs --path-walk --progress <in >out.pack 2>err &&
+	grep "Compressing objects by path" err &&
 	git -C server index-pack --stdin <out.pack
 '
 
@@ -734,7 +736,9 @@ test_expect_success '--path-walk thin pack' '
 	$(git -C server rev-parse HEAD)
 	^$(git -C server rev-parse HEAD~2)
 	EOF
-	git -C server pack-objects --thin --stdout --revs --path-walk <in >out.pack &&
+	GIT_PROGRESS_DELAY=0 git -C server pack-objects \
+		--thin --stdout --revs --path-walk --progress <in >out.pack 2>err &&
+	grep "Compressing objects by path" err &&
 	git -C server index-pack --fix-thin --stdin <out.pack
 '
 
diff --git a/t/t5530-upload-pack-error.sh b/t/t5530-upload-pack-error.sh
index 8eb6fea839a..558eedf25a4 100755
--- a/t/t5530-upload-pack-error.sh
+++ b/t/t5530-upload-pack-error.sh
@@ -34,12 +34,6 @@ test_expect_success 'upload-pack fails due to error in pack-objects packing' '
 	hexsz=$(test_oid hexsz) &&
 	printf "%04xwant %s\n00000009done\n0000" \
 		$(($hexsz + 10)) $head >input &&
-
-	# The current implementation of path-walk causes a different
-	# error message. This will be changed by a future refactoring.
-	GIT_TEST_PACK_PATH_WALK=0 &&
-	export GIT_TEST_PACK_PATH_WALK &&
-
 	test_must_fail git upload-pack . <input >/dev/null 2>output.err &&
 	test_grep "unable to read" output.err &&
 	test_grep "pack-objects died" output.err
-- 
gitgitgadget


  parent reply	other threads:[~2025-03-10  1:51 UTC|newest]

Thread overview: 73+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-03-10  1:50 [PATCH 00/13] PATH WALK II: Add --path-walk option to 'git pack-objects' Derrick Stolee via GitGitGadget
2025-03-10  1:50 ` [PATCH 01/13] pack-objects: extract should_attempt_deltas() Derrick Stolee via GitGitGadget
2025-03-12 21:01   ` Taylor Blau
2025-03-20 19:48     ` Derrick Stolee
2025-03-10  1:50 ` [PATCH 02/13] pack-objects: add --path-walk option Derrick Stolee via GitGitGadget
2025-03-12 21:14   ` Taylor Blau
2025-03-20 19:46     ` Derrick Stolee
2025-03-10  1:50 ` [PATCH 03/13] pack-objects: update usage to match docs Derrick Stolee via GitGitGadget
2025-03-12 21:14   ` Taylor Blau
2025-03-10  1:50 ` [PATCH 04/13] p5313: add performance tests for --path-walk Derrick Stolee via GitGitGadget
2025-03-10  1:50 ` [PATCH 05/13] pack-objects: introduce GIT_TEST_PACK_PATH_WALK Derrick Stolee via GitGitGadget
2025-03-10  1:50 ` [PATCH 06/13] t5538: add tests to confirm deltas in shallow pushes Derrick Stolee via GitGitGadget
2025-03-10  1:50 ` [PATCH 07/13] repack: add --path-walk option Derrick Stolee via GitGitGadget
2025-03-10  1:50 ` [PATCH 08/13] pack-objects: enable --path-walk via config Derrick Stolee via GitGitGadget
2025-03-10  1:50 ` [PATCH 09/13] scalar: enable path-walk during push " Derrick Stolee via GitGitGadget
2025-03-10  1:50 ` Derrick Stolee via GitGitGadget [this message]
2025-03-12 21:21   ` [PATCH 10/13] pack-objects: refactor path-walk delta phase Taylor Blau
2025-03-20 19:57     ` Derrick Stolee
2025-03-10  1:50 ` [PATCH 11/13] pack-objects: thread the path-based compression Derrick Stolee via GitGitGadget
2025-03-10  1:50 ` [PATCH 12/13] path-walk: add new 'edge_aggressive' option Derrick Stolee via GitGitGadget
2025-03-10  1:50 ` [PATCH 13/13] pack-objects: allow --shallow and --path-walk Derrick Stolee via GitGitGadget
2025-03-10 17:28 ` [PATCH 00/13] PATH WALK II: Add --path-walk option to 'git pack-objects' Junio C Hamano
2025-03-12 20:47   ` Taylor Blau
2025-03-20 20:18     ` Derrick Stolee
2025-03-24 15:22 ` [PATCH v2 " Derrick Stolee via GitGitGadget
2025-03-24 15:22   ` [PATCH v2 01/13] pack-objects: extract should_attempt_deltas() Derrick Stolee via GitGitGadget
2025-05-02 22:48     ` Taylor Blau
2025-03-24 15:22   ` [PATCH v2 02/13] pack-objects: add --path-walk option Derrick Stolee via GitGitGadget
2025-05-02 23:21     ` Taylor Blau
2025-05-06 19:39       ` Derrick Stolee
2025-05-16 15:27         ` Derrick Stolee
2025-03-24 15:22   ` [PATCH v2 03/13] pack-objects: update usage to match docs Derrick Stolee via GitGitGadget
2025-03-24 15:22   ` [PATCH v2 04/13] p5313: add performance tests for --path-walk Derrick Stolee via GitGitGadget
2025-05-02 23:25     ` Taylor Blau
2025-03-24 15:22   ` [PATCH v2 05/13] pack-objects: introduce GIT_TEST_PACK_PATH_WALK Derrick Stolee via GitGitGadget
2025-05-02 23:31     ` Taylor Blau
2025-05-06 19:43       ` Derrick Stolee
2025-03-24 15:22   ` [PATCH v2 06/13] t5538: add tests to confirm deltas in shallow pushes Derrick Stolee via GitGitGadget
2025-05-02 23:34     ` Taylor Blau
2025-05-16 15:32       ` Derrick Stolee
2025-03-24 15:22   ` [PATCH v2 07/13] repack: add --path-walk option Derrick Stolee via GitGitGadget
2025-05-02 23:38     ` Taylor Blau
2025-03-24 15:22   ` [PATCH v2 08/13] pack-objects: enable --path-walk via config Derrick Stolee via GitGitGadget
2025-05-02 23:42     ` Taylor Blau
2025-05-06 19:46       ` Derrick Stolee
2025-05-16 15:41         ` Derrick Stolee
2025-03-24 15:22   ` [PATCH v2 09/13] scalar: enable path-walk during push " Derrick Stolee via GitGitGadget
2025-05-07  0:58     ` Taylor Blau
2025-03-24 15:22   ` [PATCH v2 10/13] pack-objects: refactor path-walk delta phase Derrick Stolee via GitGitGadget
2025-05-07  1:14     ` Taylor Blau
2025-05-16 16:27       ` Derrick Stolee
2025-03-24 15:22   ` [PATCH v2 11/13] pack-objects: thread the path-based compression Derrick Stolee via GitGitGadget
2025-05-07  1:33     ` Taylor Blau
2025-03-24 15:22   ` [PATCH v2 12/13] path-walk: add new 'edge_aggressive' option Derrick Stolee via GitGitGadget
2025-03-24 15:22   ` [PATCH v2 13/13] pack-objects: allow --shallow and --path-walk Derrick Stolee via GitGitGadget
2025-05-02 21:24   ` [PATCH v2 00/13] PATH WALK II: Add --path-walk option to 'git pack-objects' Junio C Hamano
2025-05-02 22:45     ` Taylor Blau
2025-05-02 23:44       ` Taylor Blau
2025-05-07  1:35         ` Taylor Blau
2025-05-16 18:11   ` [PATCH v3 " Derrick Stolee via GitGitGadget
2025-05-16 18:11     ` [PATCH v3 01/13] pack-objects: extract should_attempt_deltas() Derrick Stolee via GitGitGadget
2025-05-16 18:11     ` [PATCH v3 02/13] pack-objects: add --path-walk option Derrick Stolee via GitGitGadget
2025-05-16 18:11     ` [PATCH v3 03/13] pack-objects: update usage to match docs Derrick Stolee via GitGitGadget
2025-05-16 18:11     ` [PATCH v3 04/13] p5313: add performance tests for --path-walk Derrick Stolee via GitGitGadget
2025-05-16 18:11     ` [PATCH v3 05/13] pack-objects: introduce GIT_TEST_PACK_PATH_WALK Derrick Stolee via GitGitGadget
2025-05-16 18:11     ` [PATCH v3 06/13] t5538: add tests to confirm deltas in shallow pushes Derrick Stolee via GitGitGadget
2025-05-16 18:11     ` [PATCH v3 07/13] repack: add --path-walk option Derrick Stolee via GitGitGadget
2025-05-16 18:11     ` [PATCH v3 08/13] pack-objects: enable --path-walk via config Derrick Stolee via GitGitGadget
2025-05-16 18:11     ` [PATCH v3 09/13] scalar: enable path-walk during push " Derrick Stolee via GitGitGadget
2025-05-16 18:12     ` [PATCH v3 10/13] pack-objects: refactor path-walk delta phase Derrick Stolee via GitGitGadget
2025-05-16 18:12     ` [PATCH v3 11/13] pack-objects: thread the path-based compression Derrick Stolee via GitGitGadget
2025-05-16 18:12     ` [PATCH v3 12/13] path-walk: add new 'edge_aggressive' option Derrick Stolee via GitGitGadget
2025-05-16 18:12     ` [PATCH v3 13/13] pack-objects: allow --shallow and --path-walk Derrick Stolee via GitGitGadget

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c047fbd7f275ef79695b5e1356075705e6fd7bc8.1741571455.git.gitgitgadget@gmail.com \
    --to=gitgitgadget@gmail.com \
    --cc=christian.couder@gmail.com \
    --cc=git@vger.kernel.org \
    --cc=gitster@pobox.com \
    --cc=johannes.schindelin@gmx.de \
    --cc=johncai86@gmail.com \
    --cc=jonathantanmy@google.com \
    --cc=karthik.188@gmail.com \
    --cc=kristofferhaugsbakk@fastmail.com \
    --cc=me@ttaylorr.com \
    --cc=newren@gmail.com \
    --cc=peff@peff.net \
    --cc=ps@pks.im \
    --cc=stolee@gmail.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).