* [PATCH] wip-reindex-support v3
@ 2016-07-13 2:00 Eric Wong
0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2016-07-13 2:00 UTC (permalink / raw)
To: spew
---
lib/PublicInbox/SearchIdx.pm | 90 +++++++++++++++++++++++++++-----------------
script/public-inbox-index | 12 +++++-
2 files changed, 67 insertions(+), 35 deletions(-)
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c2bf9a2..1b10041 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -57,40 +57,24 @@ sub add_message {
my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object
my $db = $self->{xdb};
- my $doc_id;
+ my ($doc_id, $old_tid);
my $mid = mid_clean(mid_mime($mime));
- my $was_ghost = 0;
my $ct_msg = $mime->header('Content-Type') || 'text/plain';
eval {
die 'Message-ID too long' if length($mid) > MAX_MID_SIZE;
my $smsg = $self->lookup_message($mid);
- my $doc;
-
if ($smsg) {
- $smsg->ensure_metadata;
# convert a ghost to a regular message
# it will also clobber any existing regular message
- $smsg->mime($mime);
- $doc = $smsg->{doc};
-
- my $type = xpfx('type');
- eval {
- $doc->remove_term($type . 'ghost');
- $was_ghost = 1;
- };
-
- # probably does not exist:
- eval { $doc->remove_term($type . 'mail') };
- $doc->add_term($type . 'mail');
- } else {
- $smsg = PublicInbox::SearchMsg->new($mime);
- $doc = $smsg->{doc};
- $doc->add_term(xpfx('mid') . $mid);
+ $doc_id = $smsg->doc_id;
+ $old_tid = $smsg->thread_id;
}
+ $smsg = PublicInbox::SearchMsg->new($mime);
+ my $doc = $smsg->{doc};
+ $doc->add_term(xpfx('mid') . $mid);
my $subj = $smsg->subject;
-
if ($subj ne '') {
my $path = $self->subject_path($subj);
$doc->add_term(xpfx('path') . id_compress($path));
@@ -148,14 +132,11 @@ sub add_message {
}
});
- if ($was_ghost) {
- $doc_id = $smsg->doc_id;
- $self->link_message($smsg, $smsg->thread_id);
- $doc->set_data($smsg->to_doc_data);
+ $self->link_message($smsg, $old_tid);
+ $doc->set_data($smsg->to_doc_data);
+ if (defined $doc_id) {
$db->replace_document($doc_id, $doc);
} else {
- $self->link_message($smsg);
- $doc->set_data($smsg->to_doc_data);
$doc_id = $db->add_document($doc);
}
};
@@ -252,9 +233,7 @@ sub link_message {
# the rest of the refs should point to this tid:
foreach $ref (@refs) {
my $ptid = $self->_resolve_mid_to_tid($ref);
- if ($tid ne $ptid) {
- $self->merge_threads($tid, $ptid);
- }
+ $self->merge_threads($tid, $ptid);
}
} else {
$tid = $self->next_thread_id;
@@ -323,6 +302,11 @@ sub index_sync {
$self->with_umask(sub { $self->_index_sync($head) });
}
+sub reindex { # public entry point: runs _reindex($head) inside with_umask
+ my ($self, $head) = @_;
+ $self->with_umask(sub { $self->_reindex($head) });
+}
+
sub rlog {
my ($self, $range, $add_cb, $del_cb) = @_;
my $hex = '[a-f0-9]';
@@ -330,7 +314,7 @@ sub rlog {
my $addmsg = qr!^:000000 100644 \S+ ($h40) A\t${hex}{2}/${hex}{38}$!;
my $delmsg = qr!^:100644 000000 ($h40) \S+ D\t${hex}{2}/${hex}{38}$!;
my $git = PublicInbox::Git->new($self->{git_dir});
- my $log = $git->popen(qw/log --reverse --no-notes --no-color
+ my $log = $git->popen(qw/log --no-notes --no-color
--raw -r --no-abbrev/, $range);
my $latest;
my $bytes;
@@ -349,6 +333,37 @@ sub rlog {
$latest;
}
+sub _reindex { # walks the $head tree via git ls-tree and passes each message blob to index_blob
+ my ($self, $head) = @_;
+ my $db = $self->{xdb};
+ my $mm = $self->{mm} = eval { # NOTE(review): $mm is undef if this eval dies, yet it is used unchecked below
+ require PublicInbox::Msgmap;
+ PublicInbox::Msgmap->new($self->{git_dir}, 1);
+ };
+ $db->begin_transaction;
+ $head ||= $db->get_metadata('last_commit') || 'HEAD'; # no $head given: use the stored last_commit, else HEAD
+ my $git = PublicInbox::Git->new($self->{git_dir});
+ my $ls = $git->popen(qw(ls-tree -r -z), $head); # -z: records are NUL-terminated, hence the separator below
+ my $nr = 0; # number of blobs handed to index_blob so far
+ local $/ = "\0";
+ my $h = '[a-f0-9]'; # one lowercase hex digit; building block for the blob id / path pattern
+ while (defined(my $l = <$ls>)) {
+ $l =~ m!\A100644 blob (${h}{40})\t${h}{2}/${h}{38}\0\z! or next;
+ my $bytes;
+ my $mime = do_cat_mail($git, $1, \$bytes) or next; # skip blobs do_cat_mail returns false for
+ my $num = $mm->num_for(mid_clean(mid_mime($mime)));
+ index_blob($self, $git, $mime, $bytes, $num);
+ ++$nr;
+ if (!($nr % 8)) { # every 8th blob: report progress and commit the current batch
+ print STDERR "\r$nr" if -t STDERR;
+ $db->commit_transaction;
+ $db->begin_transaction;
+ }
+ }
+ $db->commit_transaction; # commit whatever remains from the last partial batch
+ print STDERR "\rdone\n" if -t STDERR;
+}
+
# indexes all unindexed messages
sub _index_sync {
my ($self, $head) = @_;
@@ -375,13 +390,19 @@ sub _index_sync {
$mm->last_commit($lx);
}
} else {
- # dumb case, msgmap and xapian are out-of-sync
- # do not care for performance:
+ # Uncommon case, msgmap and xapian are out-of-sync
+ # do not care for performance (but git is fast :>)
+ # this happens if we have to reindex Xapian since
+ # msgmap is a frozen format and our Xapian format
+ # is evolving.
my $r = $lm eq '' ? $head : "$lm..$head";
+
+ # first, ensure msgmap is up-to-date:
$lm = $self->rlog($r, *index_mm, *unindex_mm);
$mm->{dbh}->commit;
$mm->last_commit($lm) if defined $lm;
+ # now deal with Xapian
$lx = $self->rlog($range, *index_mm2, *unindex_mm2);
$db->set_metadata('last_commit', $lx) if defined $lx;
}
@@ -423,6 +444,7 @@ sub create_ghost {
sub merge_threads {
my ($self, $winner_tid, $loser_tid) = @_;
+ return if $winner_tid == $loser_tid;
my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid);
my $thread_pfx = xpfx('thread');
my $db = $self->{xdb};
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 46584c1..a9df1ca 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -8,6 +8,7 @@
use strict;
use warnings;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
my $usage = "public-inbox-index GIT_DIR";
use PublicInbox::Config;
eval { require PublicInbox::SearchIdx };
@@ -15,6 +16,11 @@ if ($@) {
print STDERR "Search::Xapian required for $0\n";
exit 1;
}
+
+my $reindex;
+my %opts = ( '--reindex' => \$reindex );
+GetOptions(%opts) or die "bad command-line args\n$usage";
+
my @dirs;
sub resolve_git_dir {
@@ -59,5 +65,9 @@ sub index_dir {
-d $git_dir or die "$git_dir does not appear to be a git repository\n";
my $s = PublicInbox::SearchIdx->new($git_dir, 1);
- $s->index_sync;
+ if ($reindex) {
+ $s->reindex;
+ } else {
+ $s->index_sync;
+ }
}
--
EW
^ permalink raw reply related [flat|nested] only message in thread
only message in thread, other threads:[~2016-07-13 2:00 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-13 2:00 [PATCH] wip-reindex-support v3 Eric Wong
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).