about summary refs log tree commit homepage
diff options
context:
space:
mode:
author Eric Wong <e@80x24.org> 2019-05-30 03:59:40 +0000
committer Eric Wong <e@80x24.org> 2019-05-30 06:33:35 +0000
commit 6a2805beea98eb52b8ed866758fd2c416e22fdfb (patch)
tree 5ab5626294dc08771cc2e1792cddfde2e39becc6
parent eb5291e92aa8d9d051948c09e949f705b3178e95 (diff)
download public-inbox-6a2805beea98eb52b8ed866758fd2c416e22fdfb.tar.gz
Creating mm_tmp is an expensive operation with large inboxes
and can be avoided if there are no new messages to process.

Since git-fetch(1) currently lacks an --exit-code option(*),
mirrors will run `public-inbox-index' unconditionally after
fetch, which is an expensive op if it needs to duplicate
a large SQLite DB.

This speeds up the mirror case of:

	git --git-dir=git/$EPOCH.git fetch && public-inbox-index

This reduces the no-op `public-inbox-index' time from over 8s to
~0.5s on a (currently) 7-epoch clone of https://lore.kernel.org/lkml/
on my system.

(*) WIP --exit-code for git-fetch:
    https://public-inbox.org/git/87ftphw7mv.fsf@evledraar.gmail.com/
-rw-r--r-- lib/PublicInbox/V2Writable.pm 20
1 file changed, 17 insertions, 3 deletions
diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm
index 375f12fa..fd93ac27 100644
--- a/lib/PublicInbox/V2Writable.pm
+++ b/lib/PublicInbox/V2Writable.pm
@@ -900,6 +900,9 @@ sub sync_prepare ($$$) {
                 $pr->("$n\n") if $pr;
                 $regen_max += $n;
         }
+
+        return 0 if (!$regen_max && !keys(%{$self->{unindex_range}}));
+
         # reindex should NOT see new commits anymore, if we do,
         # it's a problem and we need to notice it via die()
         my $pad = length($regen_max) + 1;
@@ -1027,7 +1030,6 @@ sub index_sync {
         return unless defined $latest;
         $self->idx_init($opt); # acquire lock
         my $sync = {
-                mm_tmp => $self->{mm}->tmp_clone,
                 D => {}, # "$mid\0$cid" => $oid
                 unindex_range => {}, # EPOCH => oid_old..oid_new
                 reindex => $opt->{reindex},
@@ -1036,6 +1038,16 @@ sub index_sync {
         $sync->{ranges} = sync_ranges($self, $sync, $epoch_max);
         $sync->{regen} = sync_prepare($self, $sync, $epoch_max);
 
+        if ($sync->{regen}) {
+                # tmp_clone seems to fail if inside a transaction, so
+                # we rollback here (because we opened {mm} for reading)
+                # Note: we do NOT rely on DBI transactions for atomicity;
+                # only for batch performance.
+                $self->{mm}->{dbh}->rollback;
+                $self->{mm}->{dbh}->begin_work;
+                $sync->{mm_tmp} = $self->{mm}->tmp_clone;
+        }
+
         # work backwards through history
         for (my $i = $epoch_max; $i >= 0; $i--) {
                 index_epoch($self, $sync, $i);
@@ -1049,8 +1061,10 @@ sub index_sync {
                 $git->cleanup;
         }
         $self->done;
-        if (my $pr = $sync->{-opt}->{-progress}) {
-                $pr->('all.git '.sprintf($sync->{-regen_fmt}, $sync->{nr}));
+
+        if (my $nr = $sync->{nr}) {
+                my $pr = $sync->{-opt}->{-progress};
+                $pr->('all.git '.sprintf($sync->{-regen_fmt}, $nr)) if $pr;
         }
 
         # reindex does not pick up new changes, so we rerun w/o it: