From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: AS5577 212.117.160.0/19 X-Spam-Status: No, score=-1.9 required=3.0 tests=AWL,BAYES_00,LOTS_OF_MONEY, RCVD_IN_MSPIKE_BL,RCVD_IN_MSPIKE_ZBI,RCVD_IN_XBL,SPF_FAIL,SPF_HELO_FAIL, TO_EQ_FM_DOM_SPF_FAIL shortcircuit=no autolearn=no autolearn_force=no version=3.4.0 Received: from 80x24.org (2.tor.exit.babylon.network [212.117.180.21]) by dcvr.yhbt.net (Postfix) with ESMTP id BCD9420195 for ; Wed, 13 Jul 2016 02:00:30 +0000 (UTC) From: Eric Wong To: spew@80x24.org Subject: [PATCH] wip-reindex-support v3 Date: Wed, 13 Jul 2016 02:00:17 +0000 Message-Id: <20160713020017.21665-1-e@80x24.org> List-Id: --- lib/PublicInbox/SearchIdx.pm | 90 +++++++++++++++++++++++++++----------------- script/public-inbox-index | 12 +++++- 2 files changed, 67 insertions(+), 35 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index c2bf9a2..1b10041 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -57,40 +57,24 @@ sub add_message { my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object my $db = $self->{xdb}; - my $doc_id; + my ($doc_id, $old_tid); my $mid = mid_clean(mid_mime($mime)); - my $was_ghost = 0; my $ct_msg = $mime->header('Content-Type') || 'text/plain'; eval { die 'Message-ID too long' if length($mid) > MAX_MID_SIZE; my $smsg = $self->lookup_message($mid); - my $doc; - if ($smsg) { - $smsg->ensure_metadata; # convert a ghost to a regular message # it will also clobber any existing regular message - $smsg->mime($mime); - $doc = $smsg->{doc}; - - my $type = xpfx('type'); - eval { - $doc->remove_term($type . 'ghost'); - $was_ghost = 1; - }; - - # probably does not exist: - eval { $doc->remove_term($type . 'mail') }; - $doc->add_term($type . 'mail'); - } else { - $smsg = PublicInbox::SearchMsg->new($mime); - $doc = $smsg->{doc}; - $doc->add_term(xpfx('mid') . $mid); + $doc_id = $smsg->doc_id; + $old_tid = $smsg->thread_id; } + $smsg = PublicInbox::SearchMsg->new($mime); + my $doc = $smsg->{doc}; + $doc->add_term(xpfx('mid') . $mid); my $subj = $smsg->subject; - if ($subj ne '') { my $path = $self->subject_path($subj); $doc->add_term(xpfx('path') . id_compress($path)); @@ -148,14 +132,11 @@ sub add_message { } }); - if ($was_ghost) { - $doc_id = $smsg->doc_id; - $self->link_message($smsg, $smsg->thread_id); - $doc->set_data($smsg->to_doc_data); + $self->link_message($smsg, $old_tid); + $doc->set_data($smsg->to_doc_data); + if (defined $doc_id) { $db->replace_document($doc_id, $doc); } else { - $self->link_message($smsg); - $doc->set_data($smsg->to_doc_data); $doc_id = $db->add_document($doc); } }; @@ -252,9 +233,7 @@ sub link_message { # the rest of the refs should point to this tid: foreach $ref (@refs) { my $ptid = $self->_resolve_mid_to_tid($ref); - if ($tid ne $ptid) { - $self->merge_threads($tid, $ptid); - } + $self->merge_threads($tid, $ptid); } } else { $tid = $self->next_thread_id; @@ -323,6 +302,11 @@ sub index_sync { $self->with_umask(sub { $self->_index_sync($head) }); } +sub reindex { + my ($self, $head) = @_; + $self->with_umask(sub { $self->_reindex($head) }); +} + sub rlog { my ($self, $range, $add_cb, $del_cb) = @_; my $hex = '[a-f0-9]'; @@ -330,7 +314,7 @@ sub rlog { my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!; my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!; my $git = PublicInbox::Git->new($self->{git_dir}); - my $log = $git->popen(qw/log --reverse --no-notes --no-color + my $log = $git->popen(qw/log --no-notes --no-color --raw -r --no-abbrev/, $range); my $latest; my $bytes; @@ -349,6 +333,37 @@ sub rlog { $latest; } +sub _reindex { + my ($self, $head) = @_; + my $db = $self->{xdb}; + my $mm = $self->{mm} = eval { + require PublicInbox::Msgmap; + PublicInbox::Msgmap->new($self->{git_dir}, 1); + }; + $db->begin_transaction; + $head ||= $db->get_metadata('last_commit') || 'HEAD'; + my $git = PublicInbox::Git->new($self->{git_dir}); + my $ls = $git->popen(qw(ls-tree -r -z), $head); + my $nr = 0; + local $/ = "\0"; + my $h = '[a-f0-9]'; + while (defined(my $l = <$ls>)) { + $l =~ m!\A100644 blob (${h}{40})\t${h}{2}/${h}{38}\0\z! or next; + my $bytes; + my $mime = do_cat_mail($git, $1, \$bytes) or next; + my $num = $mm->num_for(mid_clean(mid_mime($mime))); + index_blob($self, $git, $mime, $bytes, $num); + ++$nr; + if (!($nr % 8)) { + print STDERR "\r$nr" if -t STDERR; + $db->commit_transaction; + $db->begin_transaction; + } + } + $db->commit_transaction; + print STDERR "\rdone\n" if -t STDERR; +} + # indexes all unindexed messages sub _index_sync { my ($self, $head) = @_; @@ -375,13 +390,19 @@ sub _index_sync { $mm->last_commit($lx); } } else { - # dumb case, msgmap and xapian are out-of-sync - # do not care for performance: + # Uncommon case, msgmap and xapian are out-of-sync + # do not care for performance (but git is fast :>) + # this happens if we have to reindex Xapian since + # msgmap is a frozen format and our Xapian format + # is evolving. my $r = $lm eq '' ? $head : "$lm..$head"; + + # first, ensure msgmap is up-to-date: $lm = $self->rlog($r, *index_mm, *unindex_mm); $mm->{dbh}->commit; $mm->last_commit($lm) if defined $lm; + # now deal with Xapian $lx = $self->rlog($range, *index_mm2, *unindex_mm2); $db->set_metadata('last_commit', $lx) if defined $lx; } @@ -423,6 +444,7 @@ sub create_ghost { sub merge_threads { my ($self, $winner_tid, $loser_tid) = @_; + return if $winner_tid == $loser_tid; my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid); my $thread_pfx = xpfx('thread'); my $db = $self->{xdb}; diff --git a/script/public-inbox-index b/script/public-inbox-index index 46584c1..a9df1ca 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -8,6 +8,7 @@ use strict; use warnings; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); my $usage = "public-inbox-index GIT_DIR"; use PublicInbox::Config; eval { require PublicInbox::SearchIdx }; @@ -15,6 +16,11 @@ if ($@) { print STDERR "Search::Xapian required for $0\n"; exit 1; } + +my $reindex; +my %opts = ( '--reindex' => \$reindex ); +GetOptions(%opts) or die "bad command-line args\n$usage"; + my @dirs; sub resolve_git_dir { @@ -59,5 +65,9 @@ sub index_dir { -d $git_dir or die "$git_dir does not appear to be a git repository\n"; my $s = PublicInbox::SearchIdx->new($git_dir, 1); - $s->index_sync; + if ($reindex) { + $s->reindex; + } else { + $s->index_sync; + } } -- EW