dumping ground for random patches and texts
 help / color / mirror / Atom feed
* [PATCH 1/2] msgmap: fix use of transactions
@ 2016-07-31  0:02 Eric Wong
  2016-07-31  0:02 ` [PATCH 2/2] search: support reindexing existing search indices Eric Wong
  0 siblings, 1 reply; 3+ messages in thread
From: Eric Wong @ 2016-07-31  0:02 UTC (permalink / raw)
  To: spew

We want transactions to be the responsibility of the
caller when possible; this fixes the potential for
the msgmap to internally become inconsistent when
using it from inside searchidx.
---
 lib/PublicInbox/Msgmap.pm    | 24 +++++++++---------------
 lib/PublicInbox/SearchIdx.pm | 11 +++--------
 2 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index 8fe17a9..2583ff4 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -33,7 +33,9 @@ sub new {
 
 	if ($writable) {
 		create_tables($dbh);
+		$dbh->begin_work;
 		$self->created_at(time) unless $self->created_at;
+		$dbh->commit;
 	}
 	$self;
 }
@@ -51,22 +53,14 @@ sub meta_accessor {
 	defined $value or
 		return $dbh->selectrow_array(meta_select, undef, $key);
 
-	$dbh->begin_work;
-	eval {
-		$prev = $dbh->selectrow_array(meta_select, undef, $key);
+	$prev = $dbh->selectrow_array(meta_select, undef, $key);
 
-		if (defined $prev) {
-			$dbh->do(meta_update, undef, $value, $key);
-		} else {
-			$dbh->do(meta_insert, undef, $key, $value);
-		}
-		$dbh->commit;
-	};
-	my $err = $@;
-	return $prev unless $err;
-
-	$dbh->rollback;
-	die $err;
+	if (defined $prev) {
+		$dbh->do(meta_update, undef, $value, $key);
+	} else {
+		$dbh->do(meta_insert, undef, $key, $value);
+	}
+	$prev;
 }
 
 sub last_commit {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c2bf9a2..3f2643c 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -369,18 +369,18 @@ sub _index_sync {
 			# Common case is the indexes are synced,
 			# we only need to run git-log once:
 			$lx = $self->rlog($range, *index_both, *unindex_both);
-			$mm->{dbh}->commit;
 			if (defined $lx) {
 				$db->set_metadata('last_commit', $lx);
 				$mm->last_commit($lx);
 			}
+			$mm->{dbh}->commit;
 		} else {
 			# dumb case, msgmap and xapian are out-of-sync
 			# do not care for performance:
 			my $r = $lm eq '' ? $head : "$lm..$head";
 			$lm = $self->rlog($r, *index_mm, *unindex_mm);
-			$mm->{dbh}->commit;
 			$mm->last_commit($lm) if defined $lm;
+			$mm->{dbh}->commit;
 
 			$lx = $self->rlog($range, *index_mm2, *unindex_mm2);
 			$db->set_metadata('last_commit', $lx) if defined $lx;
@@ -390,12 +390,7 @@ sub _index_sync {
 		$lx = $self->rlog($range, *index_blob, *unindex_blob);
 		$db->set_metadata('last_commit', $lx) if defined $lx;
 	}
-	if ($@) {
-		$db->cancel_transaction;
-		$mm->{dbh}->rollback if $mm;
-	} else {
-		$db->commit_transaction;
-	}
+	$db->commit_transaction;
 }
 
 # this will create a ghost as necessary
-- 
EW


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 2/2] search: support reindexing existing search indices
  2016-07-31  0:02 [PATCH 1/2] msgmap: fix use of transactions Eric Wong
@ 2016-07-31  0:02 ` Eric Wong
  0 siblings, 0 replies; 3+ messages in thread
From: Eric Wong @ 2016-07-31  0:02 UTC (permalink / raw)
  To: spew

This should make tweaking the way we search more efficiet
by allowing us to avoid doubling destroying the index every
time we want to change something.

We also give priority to incremental indexing via
public-inbox-{watch,mda} and have manual invocations of
public-inbox-index perform batch updates while releasing
ssoma.lock.
---
 lib/PublicInbox/SearchIdx.pm | 104 ++++++++++++++++++++++++-------------------
 script/public-inbox-index    |  14 +++++-
 2 files changed, 71 insertions(+), 47 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 3f2643c..0fe8be6 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -9,6 +9,7 @@
 package PublicInbox::SearchIdx;
 use strict;
 use warnings;
+use Fcntl qw(:flock :DEFAULT);
 use Email::MIME;
 use Email::MIME::ContentType;
 $Email::MIME::ContentType::STRICT_PARAMS = 0;
@@ -41,7 +42,14 @@ sub new {
 			require File::Path;
 			File::Path::mkpath($dir);
 			$flag = Search::Xapian::DB_CREATE_OR_OPEN;
+			my $lockpath = "$git_dir/ssoma.lock";
+			sysopen(my $lockfh, $lockpath, O_WRONLY|O_CREAT) or
+				die "failed to open lock $lockpath: $!";
+			flock($lockfh, LOCK_EX) or die "lock failed: $!\n";
+			$self->{batch_size} = 100;
+			$self->{lockfh} = $lockfh;
 		}
+
 		Search::Xapian::WritableDatabase->new($dir, $flag);
 	});
 	$self;
@@ -57,40 +65,24 @@ sub add_message {
 	my ($self, $mime, $bytes, $num) = @_; # mime = Email::MIME object
 	my $db = $self->{xdb};
 
-	my $doc_id;
+	my ($doc_id, $old_tid);
 	my $mid = mid_clean(mid_mime($mime));
-	my $was_ghost = 0;
 	my $ct_msg = $mime->header('Content-Type') || 'text/plain';
 
 	eval {
 		die 'Message-ID too long' if length($mid) > MAX_MID_SIZE;
 		my $smsg = $self->lookup_message($mid);
-		my $doc;
-
 		if ($smsg) {
-			$smsg->ensure_metadata;
 			# convert a ghost to a regular message
 			# it will also clobber any existing regular message
-			$smsg->mime($mime);
-			$doc = $smsg->{doc};
-
-			my $type = xpfx('type');
-			eval {
-				$doc->remove_term($type . 'ghost');
-				$was_ghost = 1;
-			};
-
-			# probably does not exist:
-			eval { $doc->remove_term($type . 'mail') };
-			$doc->add_term($type . 'mail');
-		}  else {
-			$smsg = PublicInbox::SearchMsg->new($mime);
-			$doc = $smsg->{doc};
-			$doc->add_term(xpfx('mid') . $mid);
+			$doc_id = $smsg->doc_id;
+			$old_tid = $smsg->thread_id;
 		}
+		$smsg = PublicInbox::SearchMsg->new($mime);
+		my $doc = $smsg->{doc};
+		$doc->add_term(xpfx('mid') . $mid);
 
 		my $subj = $smsg->subject;
-
 		if ($subj ne '') {
 			my $path = $self->subject_path($subj);
 			$doc->add_term(xpfx('path') . id_compress($path));
@@ -148,14 +140,11 @@ sub add_message {
 			}
 		});
 
-		if ($was_ghost) {
-			$doc_id = $smsg->doc_id;
-			$self->link_message($smsg, $smsg->thread_id);
-			$doc->set_data($smsg->to_doc_data);
+		link_message($self, $smsg, $old_tid);
+		$doc->set_data($smsg->to_doc_data);
+		if (defined $doc_id) {
 			$db->replace_document($doc_id, $doc);
 		} else {
-			$self->link_message($smsg);
-			$doc->set_data($smsg->to_doc_data);
 			$doc_id = $db->add_document($doc);
 		}
 	};
@@ -252,9 +241,7 @@ sub link_message {
 		# the rest of the refs should point to this tid:
 		foreach $ref (@refs) {
 			my $ptid = $self->_resolve_mid_to_tid($ref);
-			if ($tid ne $ptid) {
-				$self->merge_threads($tid, $ptid);
-			}
+			merge_threads($self, $tid, $ptid);
 		}
 	} else {
 		$tid = $self->next_thread_id;
@@ -319,8 +306,8 @@ sub do_cat_mail {
 }
 
 sub index_sync {
-	my ($self, $head) = @_;
-	$self->with_umask(sub { $self->_index_sync($head) });
+	my ($self, $opts) = @_;
+	with_umask($self, sub { $self->_index_sync($opts) });
 }
 
 sub rlog {
@@ -334,8 +321,10 @@ sub rlog {
 				--raw -r --no-abbrev/, $range);
 	my $latest;
 	my $bytes;
+	my $max = $self->{batch_size}; # may be undef
 	local $/ = "\n";
-	while (defined(my $line = <$log>)) {
+	my $line;
+	while (defined($line = <$log>)) {
 		if ($line =~ /$addmsg/o) {
 			my $mime = do_cat_mail($git, $1, \$bytes) or next;
 			$add_cb->($self, $git, $mime, $bytes);
@@ -343,25 +332,37 @@ sub rlog {
 			my $mime = do_cat_mail($git, $1) or next;
 			$del_cb->($self, $git, $mime);
 		} elsif ($line =~ /^commit ($h40)/o) {
+			last if defined $max && --$max <= 0;
 			$latest = $1;
 		}
 	}
+	$self->{-rlog_done}++ if !defined $line;
 	$latest;
 }
 
 # indexes all unindexed messages
 sub _index_sync {
-	my ($self, $head) = @_;
+	my ($self, $opts) = @_;
 	my $db = $self->{xdb};
-	$head ||= 'HEAD';
+	my $head = 'HEAD';
 	my $mm = $self->{mm} = eval {
 		require PublicInbox::Msgmap;
 		PublicInbox::Msgmap->new($self->{git_dir}, 1);
 	};
 
 	$db->begin_transaction;
-	my $lx = $db->get_metadata('last_commit');
+	my $reindex = $opts->{reindex};
+	my $last_xap = $reindex ? 'last_reindex' : 'last_commit';
+	my $lx;
+	if ($reindex && !$opts->{reindex_started}) {
+		$opts->{reindex_started} = 1;
+		$lx = '';
+	} else {
+		$lx = $db->get_metadata($last_xap);
+	}
 	my $range = $lx eq '' ? $head : "$lx..$head";
+	$self->{-rlog_done} = 0;
+	my $need_done = 1;
 	if ($mm) {
 		$mm->{dbh}->begin_work;
 		my $lm = $mm->last_commit || '';
@@ -369,28 +370,34 @@ sub _index_sync {
 			# Common case is the indexes are synced,
 			# we only need to run git-log once:
 			$lx = $self->rlog($range, *index_both, *unindex_both);
-			if (defined $lx) {
-				$db->set_metadata('last_commit', $lx);
-				$mm->last_commit($lx);
-			}
+			$mm->last_commit($lx) if defined $lx;
 			$mm->{dbh}->commit;
 		} else {
-			# dumb case, msgmap and xapian are out-of-sync
-			# do not care for performance:
+			$need_done = 2;
+			# Uncommon case, msgmap and xapian are out-of-sync
+			# do not care for performance (but git is fast :>)
+			# This happens if we have to reindex Xapian since
+			# msgmap is a frozen format and our Xapian format
+			# is evolving.
 			my $r = $lm eq '' ? $head : "$lm..$head";
+
+			# first, ensure msgmap is up-to-date:
 			$lm = $self->rlog($r, *index_mm, *unindex_mm);
 			$mm->last_commit($lm) if defined $lm;
 			$mm->{dbh}->commit;
 
+			# now deal with Xapian
 			$lx = $self->rlog($range, *index_mm2, *unindex_mm2);
-			$db->set_metadata('last_commit', $lx) if defined $lx;
 		}
 	} else {
 		# user didn't install DBD::SQLite and DBI
 		$lx = $self->rlog($range, *index_blob, *unindex_blob);
-		$db->set_metadata('last_commit', $lx) if defined $lx;
 	}
+	$db->set_metadata($last_xap, $lx) if defined $lx;
+	my $done = $need_done == $self->{-rlog_done};
+	$db->set_metadata('last_reindex', '') if $done;
 	$db->commit_transaction;
+	$done;
 }
 
 # this will create a ghost as necessary
@@ -418,6 +425,7 @@ sub create_ghost {
 
 sub merge_threads {
 	my ($self, $winner_tid, $loser_tid) = @_;
+	return if $winner_tid == $loser_tid;
 	my ($head, $tail) = $self->find_doc_ids('thread', $loser_tid);
 	my $thread_pfx = xpfx('thread');
 	my $db = $self->{xdb};
@@ -487,4 +495,10 @@ sub with_umask {
 	$rv;
 }
 
+sub DESTROY {
+	# order matters for unlocking
+	$_[0]->{xdb} = undef;
+	$_[0]->{lockfh} = undef;
+}
+
 1;
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 46584c1..6121c7b 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -8,6 +8,7 @@
 
 use strict;
 use warnings;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
 my $usage = "public-inbox-index GIT_DIR";
 use PublicInbox::Config;
 eval { require PublicInbox::SearchIdx };
@@ -15,6 +16,11 @@ if ($@) {
 	print STDERR "Search::Xapian required for $0\n";
 	exit 1;
 }
+
+my $reindex;
+my %opts = ( '--reindex' => \$reindex );
+GetOptions(%opts) or die "bad command-line args\n$usage";
+
 my @dirs;
 
 sub resolve_git_dir {
@@ -57,7 +63,11 @@ foreach my $dir (@dirs) {
 sub index_dir {
 	my ($git_dir) = @_;
 	-d $git_dir or die "$git_dir does not appear to be a git repository\n";
+	my $done = 0;
+	my $opts = { reindex => $reindex };
 
-	my $s = PublicInbox::SearchIdx->new($git_dir, 1);
-	$s->index_sync;
+	until ($done) {
+		my $s = PublicInbox::SearchIdx->new($git_dir, 1);
+		$done = $s->index_sync($opts);
+	}
 }
-- 
EW


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH 1/2] msgmap: fix use of transactions
@ 2016-07-31  7:28 Eric Wong
  0 siblings, 0 replies; 3+ messages in thread
From: Eric Wong @ 2016-07-31  7:28 UTC (permalink / raw)
  To: spew

We want transactions to be the responsibility of the
caller when possible; this fixes the potential for
the msgmap to internally become inconsistent when
using it from inside searchidx.
---
 lib/PublicInbox/Msgmap.pm    | 24 +++++++++---------------
 lib/PublicInbox/SearchIdx.pm | 11 +++--------
 2 files changed, 12 insertions(+), 23 deletions(-)

diff --git a/lib/PublicInbox/Msgmap.pm b/lib/PublicInbox/Msgmap.pm
index 8fe17a9..2583ff4 100644
--- a/lib/PublicInbox/Msgmap.pm
+++ b/lib/PublicInbox/Msgmap.pm
@@ -33,7 +33,9 @@ sub new {
 
 	if ($writable) {
 		create_tables($dbh);
+		$dbh->begin_work;
 		$self->created_at(time) unless $self->created_at;
+		$dbh->commit;
 	}
 	$self;
 }
@@ -51,22 +53,14 @@ sub meta_accessor {
 	defined $value or
 		return $dbh->selectrow_array(meta_select, undef, $key);
 
-	$dbh->begin_work;
-	eval {
-		$prev = $dbh->selectrow_array(meta_select, undef, $key);
+	$prev = $dbh->selectrow_array(meta_select, undef, $key);
 
-		if (defined $prev) {
-			$dbh->do(meta_update, undef, $value, $key);
-		} else {
-			$dbh->do(meta_insert, undef, $key, $value);
-		}
-		$dbh->commit;
-	};
-	my $err = $@;
-	return $prev unless $err;
-
-	$dbh->rollback;
-	die $err;
+	if (defined $prev) {
+		$dbh->do(meta_update, undef, $value, $key);
+	} else {
+		$dbh->do(meta_insert, undef, $key, $value);
+	}
+	$prev;
 }
 
 sub last_commit {
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c2bf9a2..3f2643c 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -369,18 +369,18 @@ sub _index_sync {
 			# Common case is the indexes are synced,
 			# we only need to run git-log once:
 			$lx = $self->rlog($range, *index_both, *unindex_both);
-			$mm->{dbh}->commit;
 			if (defined $lx) {
 				$db->set_metadata('last_commit', $lx);
 				$mm->last_commit($lx);
 			}
+			$mm->{dbh}->commit;
 		} else {
 			# dumb case, msgmap and xapian are out-of-sync
 			# do not care for performance:
 			my $r = $lm eq '' ? $head : "$lm..$head";
 			$lm = $self->rlog($r, *index_mm, *unindex_mm);
-			$mm->{dbh}->commit;
 			$mm->last_commit($lm) if defined $lm;
+			$mm->{dbh}->commit;
 
 			$lx = $self->rlog($range, *index_mm2, *unindex_mm2);
 			$db->set_metadata('last_commit', $lx) if defined $lx;
@@ -390,12 +390,7 @@ sub _index_sync {
 		$lx = $self->rlog($range, *index_blob, *unindex_blob);
 		$db->set_metadata('last_commit', $lx) if defined $lx;
 	}
-	if ($@) {
-		$db->cancel_transaction;
-		$mm->{dbh}->rollback if $mm;
-	} else {
-		$db->commit_transaction;
-	}
+	$db->commit_transaction;
 }
 
 # this will create a ghost as necessary
-- 
EW


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2016-07-31  7:28 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-31  0:02 [PATCH 1/2] msgmap: fix use of transactions Eric Wong
2016-07-31  0:02 ` [PATCH 2/2] search: support reindexing existing search indices Eric Wong
  -- strict thread matches above, loose matches on Subject: below --
2016-07-31  7:28 [PATCH 1/2] msgmap: fix use of transactions Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).