From 13ed8aebfc43bd99571d589743333f36bff5567f Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 30 May 2019 03:59:38 +0000 Subject: v2writable: split off unindex_range mapping It'll make it easier to detect if we have anything to unindex and run git-log on, at all. --- lib/PublicInbox/V2Writable.pm | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 6b011712..df8cfb45 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -861,7 +861,7 @@ Rewritten history? (in $git->{git_dir}) reindexing $git->{git_dir} starting at $range - $sync->{"unindex-range.$i"} = "$base..$cur"; + $sync->{unindex_range}->{$i} = "$base..$cur"; } $range; } @@ -993,6 +993,7 @@ sub index_sync { my $sync = { mm_tmp => $self->{mm}->tmp_clone, D => {}, # "$mid\0$cid" => $oid + unindex_range => {}, # EPOCH => oid_old..oid_new reindex => $opt->{reindex}, -opt => $opt }; @@ -1009,7 +1010,7 @@ sub index_sync { -d $git_dir or next; # missing parts are fine fill_alternates($self, $i); my $git = PublicInbox::Git->new($git_dir); - my $unindex_range = delete $sync->{"unindex-range.$i"}; + my $unindex_range = delete $sync->{unindex_range}->{$i}; unindex($self, $sync, $git, $unindex_range) if $unindex_range; defined(my $range = $sync->{ranges}->[$i]) or next; $pr->("$i.git indexing $range\n") if $pr; -- cgit v1.2.3-24-ge0c7 From eb5291e92aa8d9d051948c09e949f705b3178e95 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 30 May 2019 03:59:39 +0000 Subject: v2writable: hoist out index_epoch sub This will make future changes easier-to-follow. --- lib/PublicInbox/V2Writable.pm | 65 ++++++++++++++++++++++++------------------- 1 file changed, 37 insertions(+), 28 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index df8cfb45..375f12fa 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -981,6 +981,42 @@ sub sync_ranges ($$$) { $ranges; } +sub index_epoch ($$$) { + my ($self, $sync, $i) = @_; + + my $git_dir = git_dir_n($self, $i); + die 'BUG: already reindexing!' if $self->{reindex_pipe}; + -d $git_dir or return; # missing parts are fine + fill_alternates($self, $i); + my $git = PublicInbox::Git->new($git_dir); + if (my $unindex_range = delete $sync->{unindex_range}->{$i}) { + unindex($self, $sync, $git, $unindex_range); + } + defined(my $range = $sync->{ranges}->[$i]) or return; + if (my $pr = $sync->{-opt}->{-progress}) { + $pr->("$i.git indexing $range\n"); + } + + my @cmd = qw(log --raw -r --pretty=tformat:%H + --no-notes --no-color --no-abbrev --no-renames); + my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range); + my $cmt; + while (<$fh>) { + chomp; + $self->{current_info} = "$i.git $_"; + if (/\A$x40$/o && !defined($cmt)) { + $cmt = $_; + } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { + reindex_oid($self, $sync, $git, $1); + } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { + mark_deleted($self, $sync, $git, $1); + } + } + $fh = undef; + delete $self->{reindex_pipe}; + update_last_commit($self, $git, $i, $cmt) if defined $cmt; +} + # public, called by public-inbox-index sub index_sync { my ($self, $opt) = @_; @@ -1000,36 +1036,9 @@ sub index_sync { $sync->{ranges} = sync_ranges($self, $sync, $epoch_max); $sync->{regen} = sync_prepare($self, $sync, $epoch_max); - my @cmd = qw(log --raw -r --pretty=tformat:%H - --no-notes --no-color --no-abbrev --no-renames); - # work backwards through history for (my $i = $epoch_max; $i >= 0; $i--) { - my $git_dir = git_dir_n($self, $i); - die 'BUG: already reindexing!' if $self->{reindex_pipe}; - -d $git_dir or next; # missing parts are fine - fill_alternates($self, $i); - my $git = PublicInbox::Git->new($git_dir); - my $unindex_range = delete $sync->{unindex_range}->{$i}; - unindex($self, $sync, $git, $unindex_range) if $unindex_range; - defined(my $range = $sync->{ranges}->[$i]) or next; - $pr->("$i.git indexing $range\n") if $pr; - my $fh = $self->{reindex_pipe} = $git->popen(@cmd, $range); - my $cmt; - while (<$fh>) { - chomp; - $self->{current_info} = "$i.git $_"; - if (/\A$x40$/o && !defined($cmt)) { - $cmt = $_; - } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\tm$/o) { - reindex_oid($self, $sync, $git, $1); - } elsif (/\A:\d{6} 100644 $x40 ($x40) [AM]\td$/o) { - mark_deleted($self, $sync, $git, $1); - } - } - $fh = undef; - delete $self->{reindex_pipe}; - update_last_commit($self, $git, $i, $cmt) if defined $cmt; + index_epoch($self, $sync, $i); } # unindex is required for leftovers if "deletes" affect messages -- cgit v1.2.3-24-ge0c7 From 6a2805beea98eb52b8ed866758fd2c416e22fdfb Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 30 May 2019 03:59:40 +0000 Subject: v2writable: avoid mm_tmp creation without regen Creating mm_tmp is an expensive operation with large inboxes and can be avoided if there are no new messages to process. Since git-fetch(1) currently lacks an --exit-code option(*), mirrors will run `public-inbox-index' unconditionally after fetch, which is an expensive op if it needs to duplicate a large SQLite DB. This speeds up the mirror case of: git --git-dir=git/$EPOCH.git fetch && public-inbox-index This reduces the no-op `public-inbox-index' time from over 8s to ~0.5s on a (currently) 7-epoch clone of https://lore.kernel.org/lkml/ on my system. (*) WIP --exit-code for git-fetch: https://public-inbox.org/git/87ftphw7mv.fsf@evledraar.gmail.com/ --- lib/PublicInbox/V2Writable.pm | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index 375f12fa..fd93ac27 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -900,6 +900,9 @@ sub sync_prepare ($$$) { $pr->("$n\n") if $pr; $regen_max += $n; } + + return 0 if (!$regen_max && !keys(%{$self->{unindex_range}})); + # reindex should NOT see new commits anymore, if we do, # it's a problem and we need to notice it via die() my $pad = length($regen_max) + 1; @@ -1027,7 +1030,6 @@ sub index_sync { return unless defined $latest; $self->idx_init($opt); # acquire lock my $sync = { - mm_tmp => $self->{mm}->tmp_clone, D => {}, # "$mid\0$cid" => $oid unindex_range => {}, # EPOCH => oid_old..oid_new reindex => $opt->{reindex}, @@ -1036,6 +1038,16 @@ sub index_sync { $sync->{ranges} = sync_ranges($self, $sync, $epoch_max); $sync->{regen} = sync_prepare($self, $sync, $epoch_max); + if ($sync->{regen}) { + # tmp_clone seems to fail if inside a transaction, so + # we rollback here (because we opened {mm} for reading) + # Note: we do NOT rely on DBI transactions for atomicity; + # only for batch performance. + $self->{mm}->{dbh}->rollback; + $self->{mm}->{dbh}->begin_work; + $sync->{mm_tmp} = $self->{mm}->tmp_clone; + } + # work backwards through history for (my $i = $epoch_max; $i >= 0; $i--) { index_epoch($self, $sync, $i); @@ -1049,8 +1061,10 @@ sub index_sync { $git->cleanup; } $self->done; - if (my $pr = $sync->{-opt}->{-progress}) { - $pr->('all.git '.sprintf($sync->{-regen_fmt}, $sync->{nr})); + + if (my $nr = $sync->{nr}) { + my $pr = $sync->{-opt}->{-progress}; + $pr->('all.git '.sprintf($sync->{-regen_fmt}, $nr)) if $pr; } # reindex does not pick up new changes, so we rerun w/o it: -- cgit v1.2.3-24-ge0c7 From e261bedfae7f2eb192109b5fdd4113440fee7e22 Mon Sep 17 00:00:00 2001 From: Eric Wong Date: Thu, 30 May 2019 06:35:05 +0000 Subject: v2writable: short-circuit is_ancestor check on equality We don't need to use git to check ancestry if object IDs match on a string comparison. This saves 100ms or so and brings down the ~0.5s no-op time on lore.kernel.org/lkml down to ~0.4s. --- lib/PublicInbox/V2Writable.pm | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/lib/PublicInbox/V2Writable.pm b/lib/PublicInbox/V2Writable.pm index fd93ac27..76844cd4 100644 --- a/lib/PublicInbox/V2Writable.pm +++ b/lib/PublicInbox/V2Writable.pm @@ -831,6 +831,12 @@ sub log_range ($$$$$) { return $tip; # all of it }; + # fast equality check to avoid (v)fork+execve overhead + if ($cur eq $tip) { + $sync->{ranges}->[$i] = undef; + return; + } + my $range = "$cur..$tip"; $pr->("$i.git checking contiguity... ") if $pr; if (is_ancestor($git, $cur, $tip)) { # common case -- cgit v1.2.3-24-ge0c7