dumping ground for random patches and texts
 help / color / mirror / Atom feed
* [PATCH] search: support reindexing
@ 2016-07-28 23:09 Eric Wong
  0 siblings, 0 replies; only message in thread
From: Eric Wong @ 2016-07-28 23:09 UTC (permalink / raw)
  To: spew

---
 lib/PublicInbox/Msgmap.pm    | 35 +++++++++++-----------
 lib/PublicInbox/SearchIdx.pm | 71 ++++++++++++++++----------------------------
 script/public-inbox-index    |  8 ++++-
 3 files changed, 50 insertions(+), 64 deletions(-)

diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index 8fe17a9..ca78756 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -32,8 +32,16 @@ sub new {
 	my $self = bless { dbh => $dbh }, $class;
 
 	if ($writable) {
-		create_tables($dbh);
-		$self->created_at(time) unless $self->created_at;
+		$dbh->begin_work;
+		eval {
+			create_tables($dbh);
+			$self->created_at(time) unless $self->created_at;
+			$dbh->commit;
+		};
+		if (my $err = $@) {
+			$dbh->rollback;
+			die $err;
+		}
 	}
 	$self;
 }
@@ -51,22 +59,13 @@ sub meta_accessor {
 	defined $value or
 		return $dbh->selectrow_array(meta_select, undef, $key);
 
-	$dbh->begin_work;
-	eval {
-		$prev = $dbh->selectrow_array(meta_select, undef, $key);
-
-		if (defined $prev) {
-			$dbh->do(meta_update, undef, $value, $key);
-		} else {
-			$dbh->do(meta_insert, undef, $key, $value);
-		}
-		$dbh->commit;
-	};
-	my $err = $@;
-	return $prev unless $err;
-
-	$dbh->rollback;
-	die $err;
+	$prev = $dbh->selectrow_array(meta_select, undef, $key);
+	if (defined $prev) {
+		$dbh->do(meta_update, undef, $value, $key);
+	} else {
+		$dbh->do(meta_insert, undef, $key, $value);
+	}
+	$prev;
 }
 
 sub last_commit {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c2bf9a2..5a4ff22 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -57,40 +57,24 @@ sub add_message {
 	my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object
 	my $db = $self->{xdb};
 
-	my $doc_id;
+	my ($doc_id, $old_tid);
 	my $mid = mid_clean(mid_mime($mime));
-	my $was_ghost = 0;
 	my $ct_msg = $mime->header('Content-Type') || 'text/plain';
 
 	eval {
 		die 'Message-ID too long' if length($mid) > MAX_MID_SIZE;
 		my $smsg = $self->lookup_message($mid);
-		my $doc;
-
 		if ($smsg) {
-			$smsg->ensure_metadata;
 			# convert a ghost to a regular message
 			# it will also clobber any existing regular message
-			$smsg->mime($mime);
-			$doc = $smsg->{doc};
-
-			my $type = xpfx('type');
-			eval {
-				$doc->remove_term($type . 'ghost');
-				$was_ghost = 1;
-			};
-
-			# probably does not exist:
-			eval { $doc->remove_term($type . 'mail') };
-			$doc->add_term($type . 'mail');
-		}  else {
-			$smsg = PublicInbox::SearchMsg->new($mime);
-			$doc = $smsg->{doc};
-			$doc->add_term(xpfx('mid') . $mid);
+			$doc_id = $smsg->doc_id;
+			$old_tid = $smsg->thread_id;
 		}
+		$smsg = PublicInbox::SearchMsg->new($mime);
+		my $doc = $smsg->{doc};
+		$doc->add_term(xpfx('mid') . $mid);
 
 		my $subj = $smsg->subject;
-
 		if ($subj ne '') {
 			my $path = $self->subject_path($subj);
 			$doc->add_term(xpfx('path') . id_compress($path));
@@ -148,14 +132,11 @@ sub add_message {
 			}
 		});
 
-		if ($was_ghost) {
-			$doc_id = $smsg->doc_id;
-			$self->link_message($smsg, $smsg->thread_id);
-			$doc->set_data($smsg->to_doc_data);
+		link_message($self, $smsg, $old_tid);
+		$doc->set_data($smsg->to_doc_data);
+		if (defined $doc_id) {
 			$db->replace_document($doc_id, $doc);
 		} else {
-			$self->link_message($smsg);
-			$doc->set_data($smsg->to_doc_data);
 			$doc_id = $db->add_document($doc);
 		}
 	};
@@ -252,9 +233,7 @@ sub link_message {
 		# the rest of the refs should point to this tid:
 		foreach $ref (@refs) {
 			my $ptid = $self->_resolve_mid_to_tid($ref);
-			if ($tid ne $ptid) {
-				$self->merge_threads($tid, $ptid);
-			}
+			merge_threads($self, $tid, $ptid);
 		}
 	} else {
 		$tid = $self->next_thread_id;
@@ -319,8 +298,8 @@ sub do_cat_mail {
 }
 
 sub index_sync {
-	my ($self, $head) = @_;
-	$self->with_umask(sub { $self->_index_sync($head) });
+	my ($self, $opts) = @_;
+	with_umask($self, sub { $self->_index_sync($opts) });
 }
 
 sub rlog {
@@ -351,9 +330,9 @@ sub rlog {
 
 # indexes all unindexed messages
 sub _index_sync {
-	my ($self, $head) = @_;
+	my ($self, $opts) = @_;
 	my $db = $self->{xdb};
-	$head ||= 'HEAD';
+	my $head = 'HEAD';
 	my $mm = $self->{mm} = eval {
 		require PublicInbox::Msgmap;
 		PublicInbox::Msgmap->new($self->{git_dir}, 1);
@@ -369,19 +348,25 @@ sub _index_sync {
 			# Common case is the indexes are synced,
 			# we only need to run git-log once:
 			$lx = $self->rlog($range, *index_both, *unindex_both);
-			$mm->{dbh}->commit;
 			if (defined $lx) {
 				$db->set_metadata('last_commit', $lx);
 				$mm->last_commit($lx);
 			}
+			$mm->{dbh}->commit;
 		} else {
-			# dumb case, msgmap and xapian are out-of-sync
-			# do not care for performance:
+			# Uncommon case, msgmap and xapian are out-of-sync
+			# do not care for performance (but git is fast :>)
+			# this happens if we have to reindex Xapian since
+			# msgmap is a frozen format and our Xapian format
+			# is evolving.
 			my $r = $lm eq '' ? $head : "$lm..$head";
+
+			# first, ensure msgmap is up-to-date:
 			$lm = $self->rlog($r, *index_mm, *unindex_mm);
-			$mm->{dbh}->commit;
 			$mm->last_commit($lm) if defined $lm;
+			$mm->{dbh}->commit;
 
+			# now deal with Xapian
 			$lx = $self->rlog($range, *index_mm2, *unindex_mm2);
 			$db->set_metadata('last_commit', $lx) if defined $lx;
 		}
@@ -390,12 +375,7 @@ sub _index_sync {
 		$lx = $self->rlog($range, *index_blob, *unindex_blob);
 		$db->set_metadata('last_commit', $lx) if defined $lx;
 	}
-	if ($@) {
-		$db->cancel_transaction;
-		$mm->{dbh}->rollback if $mm;
-	} else {
-		$db->commit_transaction;
-	}
+	$db->commit_transaction;
 }
 
 # this will create a ghost as necessary
@@ -423,6 +403,7 @@ sub create_ghost {
 
 sub merge_threads {
 	my ($self, $winner_tid, $loser_tid) = @_;
+	return if $winner_tid == $loser_tid;
 	my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid);
 	my $thread_pfx = xpfx('thread');
 	my $db = $self->{xdb};
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 46584c1..16c6d45 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -8,6 +8,7 @@
 
 use strict;
 use warnings;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
 my $usage = "public-inbox-index GIT_DIR";
 use PublicInbox::Config;
 eval { require PublicInbox::SearchIdx };
@@ -15,6 +16,11 @@ if ($@) {
 	print STDERR "Search::Xapian required for $0\n";
 	exit 1;
 }
+
+my $reindex;
+my %opts = ( '--reindex' => \$reindex );
+GetOptions(%opts) or die "bad command-line args\n$usage";
+
 my @dirs;
 
 sub resolve_git_dir {
@@ -59,5 +65,5 @@ sub index_dir {
 	-d $git_dir or die "$git_dir does not appear to be a git repository\n";
 
 	my $s = PublicInbox::SearchIdx->new($git_dir, 1);
-	$s->index_sync;
+	$s->index_sync({ reindex => $reindex });
 }
-- 
EW


^ permalink raw reply related	[flat|nested] only message in thread

only message in thread, other threads: [~2016-07-28 23:09 UTC | newest]

Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-28 23:09 [PATCH] search: support reindexing Eric Wong

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).