From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.0 (2014-02-07) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: AS43513 85.31.96.0/21 X-Spam-Status: No, score=-1.8 required=3.0 tests=BAYES_00,RCVD_IN_XBL, RDNS_NONE,SPF_FAIL,SPF_HELO_FAIL,TO_EQ_FM_DOM_SPF_FAIL shortcircuit=no autolearn=no autolearn_force=no version=3.4.0 Received: from 80x24.org (unknown [85.31.101.98]) by dcvr.yhbt.net (Postfix) with ESMTP id 6635F1FE4E for ; Mon, 11 Jul 2016 08:04:51 +0000 (UTC) From: Eric Wong To: spew@80x24.org Subject: [PATCH] wip-reindex-support Date: Mon, 11 Jul 2016 08:04:47 +0000 Message-Id: <20160711080447.31288-1-e@80x24.org> List-Id: --- lib/PublicInbox/SearchIdx.pm | 67 ++++++++++++++++++++++++++++++++++++++++---- script/public-inbox-index | 14 ++++++++- 2 files changed, 74 insertions(+), 7 deletions(-) diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index c2bf9a2..8260823 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -58,8 +58,9 @@ sub add_message { my $db = $self->{xdb}; my $doc_id; + my $old_tid; my $mid = mid_clean(mid_mime($mime)); - my $was_ghost = 0; + my ($was_ghost, $was_mail); my $ct_msg = $mime->header('Content-Type') || 'text/plain'; eval { @@ -81,8 +82,20 @@ sub add_message { }; # probably does not exist: - eval { $doc->remove_term($type . 'mail') }; - $doc->add_term($type . 'mail'); + eval { + $doc->remove_term($type . 'mail'); + $was_mail = 1; + }; + + if ($was_mail) { + $old_tid = $smsg->thread_id; + $db->delete_document($smsg->doc_id); + $smsg = PublicInbox::SearchMsg->new($mime); + $doc = $smsg->{doc}; + $doc->add_term(xpfx('mid') . $mid); + } else { + $doc->add_term($type . 'mail'); + } } else { $smsg = PublicInbox::SearchMsg->new($mime); $doc = $smsg->{doc}; @@ -154,7 +167,7 @@ sub add_message { $doc->set_data($smsg->to_doc_data); $db->replace_document($doc_id, $doc); } else { - $self->link_message($smsg); + $self->link_message($smsg, $old_tid); $doc->set_data($smsg->to_doc_data); $doc_id = $db->add_document($doc); } @@ -323,6 +336,11 @@ sub index_sync { $self->with_umask(sub { $self->_index_sync($head) }); } +sub reindex { + my ($self, $head) = @_; + $self->with_umask(sub { $self->_reindex($head) }); +} + sub rlog { my ($self, $range, $add_cb, $del_cb) = @_; my $hex = '[a-f0-9]'; @@ -349,6 +367,37 @@ sub rlog { $latest; } +sub _reindex { + my ($self, $head) = @_; + my $db = $self->{xdb}; + my $mm = $self->{mm} = eval { + require PublicInbox::Msgmap; + PublicInbox::Msgmap->new($self->{git_dir}, 1); + }; + $db->begin_transaction; + $head ||= $db->get_metadata('last_commit') || 'HEAD'; + my $git = PublicInbox::Git->new($self->{git_dir}); + my $ls = $git->popen(qw(ls-tree -r -z), $head); + my $nr = 0; + local $/ = "\0"; + my $h = '[a-f0-9]'; + while (defined(my $l = <$ls>)) { + $l =~ m!\A100644 blob (${h}{40})\t${h}{2}/${h}{38}\0\z! or next; + my $bytes; + my $mime = do_cat_mail($git, $1, \$bytes) or next; + my $num = $mm->num_for(mid_clean(mid_mime($mime))); + index_blob($self, $git, $mime, $bytes, $num); + ++$nr; + if (!($nr % 8)) { + print STDERR "\r$nr" if -t STDERR; + $db->commit_transaction; + $db->begin_transaction; + } + } + $db->commit_transaction; + print STDERR "\rdone\n" if -t STDERR; +} + # indexes all unindexed messages sub _index_sync { my ($self, $head) = @_; @@ -375,13 +424,19 @@ sub _index_sync { $mm->last_commit($lx); } } else { - # dumb case, msgmap and xapian are out-of-sync - # do not care for performance: + # Uncommon case, msgmap and xapian are out-of-sync + # do not care for performance (but git is fast :>) + # this happens if we have to reindex Xapian since + # msgmap is a frozen format and our Xapian format + # is evolving. my $r = $lm eq '' ? $head : "$lm..$head"; + + # first, ensure msgmap is up-to-date: $lm = $self->rlog($r, *index_mm, *unindex_mm); $mm->{dbh}->commit; $mm->last_commit($lm) if defined $lm; + # now deal with Xapian $lx = $self->rlog($range, *index_mm2, *unindex_mm2); $db->set_metadata('last_commit', $lx) if defined $lx; } diff --git a/script/public-inbox-index b/script/public-inbox-index index 46584c1..1297f06 100755 --- a/script/public-inbox-index +++ b/script/public-inbox-index @@ -8,6 +8,7 @@ use strict; use warnings; +use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev); my $usage = "public-inbox-index GIT_DIR"; use PublicInbox::Config; eval { require PublicInbox::SearchIdx }; @@ -15,6 +16,13 @@ if ($@) { print STDERR "Search::Xapian required for $0\n"; exit 1; } + +my $reindex; +my %opts = ( + '--reindex' => \$reindex, +); +GetOptions(%opts) or die "bad command-line args\n$usage"; + my @dirs; sub resolve_git_dir { @@ -59,5 +67,9 @@ sub index_dir { -d $git_dir or die "$git_dir does not appear to be a git repository\n"; my $s = PublicInbox::SearchIdx->new($git_dir, 1); - $s->index_sync; + if ($reindex) { + $s->reindex; + } else { + $s->index_sync; + } }