dumping ground for random patches and texts
 help / color / mirror / Atom feed
* [PATCH] wip-reindex-support
@ 2016-07-11  4:07 Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2016-07-11  4:07 UTC (permalink / raw)
  To: spew

---
 lib/PublicInbox/SearchIdx.pm | 54 ++++++++++++++++++++++++++++++++++++++++----
 script/public-inbox-index    | 14 +++++++++++-
 2 files changed, 63 insertions(+), 5 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c2bf9a2..c3d9d24 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -59,7 +59,7 @@ sub add_message {
 
 	my $doc_id;
 	my $mid = mid_clean(mid_mime($mime));
-	my $was_ghost = 0;
+	my ($was_ghost, $was_mail);
 	my $ct_msg = $mime->header('Content-Type') || 'text/plain';
 
 	eval {
@@ -81,7 +81,15 @@ sub add_message {
 			};
 
 			# probably does not exist:
-			eval { $doc->remove_term($type . 'mail') };
+			eval {
+				$doc->remove_term($type . 'mail');
+				$was_mail = 1;
+			};
+
+			if ($was_mail) {
+				$doc->clear_terms;
+				$doc->clear_values;
+			}
 			$doc->add_term($type . 'mail');
 		}  else {
 			$smsg = PublicInbox::SearchMsg->new($mime);
@@ -323,6 +331,11 @@ sub index_sync {
 	$self->with_umask(sub { $self->_index_sync($head) });
 }
 
+sub reindex {
+	my ($self, $head) = @_;
+	$self->with_umask(sub { $self->_reindex($head) });
+}
+
 sub rlog {
 	my ($self, $range, $add_cb, $del_cb) = @_;
 	my $hex = '[a-f0-9]';
@@ -349,6 +362,33 @@ sub rlog {
 	$latest;
 }
 
+sub _reindex {
+	my ($self, $head) = @_;
+	my $db = $self->{xdb};
+	my $mm = $self->{mm} = eval {
+		require PublicInbox::Msgmap;
+		PublicInbox::Msgmap->new($self->{git_dir}, 1);
+	};
+	$db->begin_transaction;
+	$head ||= $db->get_metadata('last_commit') || 'HEAD';
+	my $git = PublicInbox::Git->new($self->{git_dir});
+	my $ls = $git->popen(qw(ls-tree -r -z), $head);
+	local $/ = "\0";
+	my $h = '[a-f0-9]';
+	while (defined(my $l = <$ls>)) {
+		$l =~ m!\A100644 blob (${h}{40})\t${h}{2}/${h}{38}\0\z! or next;
+		my $bytes;
+		my $mime = do_cat_mail($git, $1, \$bytes) or next;
+		my $num = $mm->num_for(mid_clean(mid_mime($mime)));
+		index_blob($self, $git, $mime, $bytes, $num);
+	}
+	if ($@) {
+		$db->cancel_transaction;
+	} else {
+		$db->commit_transaction;
+	}
+}
+
 # indexes all unindexed messages
 sub _index_sync {
 	my ($self, $head) = @_;
@@ -375,13 +415,19 @@ sub _index_sync {
 				$mm->last_commit($lx);
 			}
 		} else {
-			# dumb case, msgmap and xapian are out-of-sync
-			# do not care for performance:
+			# Uncommon case: msgmap and Xapian are out-of-sync.
+			# We do not care about performance here (but git is
+			# fast :>).  This happens when we have to reindex
+			# Xapian, since msgmap is a frozen format while our
+			# Xapian format is still evolving.
 			my $r = $lm eq '' ? $head : "$lm..$head";
+
+			# first, ensure msgmap is up-to-date:
 			$lm = $self->rlog($r, *index_mm, *unindex_mm);
 			$mm->{dbh}->commit;
 			$mm->last_commit($lm) if defined $lm;
 
+			# now deal with Xapian
 			$lx = $self->rlog($range, *index_mm2, *unindex_mm2);
 			$db->set_metadata('last_commit', $lx) if defined $lx;
 		}
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 46584c1..1297f06 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -8,6 +8,7 @@
 
 use strict;
 use warnings;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
 my $usage = "public-inbox-index GIT_DIR";
 use PublicInbox::Config;
 eval { require PublicInbox::SearchIdx };
@@ -15,6 +16,13 @@ if ($@) {
 	print STDERR "Search::Xapian required for $0\n";
 	exit 1;
 }
+
+my $reindex;
+my %opts = (
+	'--reindex' => \$reindex,
+);
+GetOptions(%opts) or die "bad command-line args\n$usage";
+
 my @dirs;
 
 sub resolve_git_dir {
@@ -59,5 +67,9 @@ sub index_dir {
 	-d $git_dir or die "$git_dir does not appear to be a git repository\n";
 
 	my $s = PublicInbox::SearchIdx->new($git_dir, 1);
-	$s->index_sync;
+	if ($reindex) {
+		$s->reindex;
+	} else {
+		$s->index_sync;
+	}
 }

^ permalink raw reply related	[flat|nested] 2+ messages in thread

* [PATCH] wip-reindex-support
@ 2016-07-11  8:04 Eric Wong
  0 siblings, 0 replies; 2+ messages in thread
From: Eric Wong @ 2016-07-11  8:04 UTC (permalink / raw)
  To: spew

---
 lib/PublicInbox/SearchIdx.pm | 67 ++++++++++++++++++++++++++++++++++++++++----
 script/public-inbox-index    | 14 ++++++++-
 2 files changed, 74 insertions(+), 7 deletions(-)

diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index c2bf9a2..8260823 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -58,8 +58,9 @@ sub add_message {
 	my $db = $self->{xdb};
 
 	my $doc_id;
+	my $old_tid;
 	my $mid = mid_clean(mid_mime($mime));
-	my $was_ghost = 0;
+	my ($was_ghost, $was_mail);
 	my $ct_msg = $mime->header('Content-Type') || 'text/plain';
 
 	eval {
@@ -81,8 +82,20 @@ sub add_message {
 			};
 
 			# probably does not exist:
-			eval { $doc->remove_term($type . 'mail') };
-			$doc->add_term($type . 'mail');
+			eval {
+				$doc->remove_term($type . 'mail');
+				$was_mail = 1;
+			};
+
+			if ($was_mail) {
+				$old_tid = $smsg->thread_id;
+				$db->delete_document($smsg->doc_id);
+				$smsg = PublicInbox::SearchMsg->new($mime);
+				$doc = $smsg->{doc};
+				$doc->add_term(xpfx('mid') . $mid);
+			} else {
+				$doc->add_term($type . 'mail');
+			}
 		}  else {
 			$smsg = PublicInbox::SearchMsg->new($mime);
 			$doc = $smsg->{doc};
@@ -154,7 +167,7 @@ sub add_message {
 			$doc->set_data($smsg->to_doc_data);
 			$db->replace_document($doc_id, $doc);
 		} else {
-			$self->link_message($smsg);
+			$self->link_message($smsg, $old_tid);
 			$doc->set_data($smsg->to_doc_data);
 			$doc_id = $db->add_document($doc);
 		}
@@ -323,6 +336,11 @@ sub index_sync {
 	$self->with_umask(sub { $self->_index_sync($head) });
 }
 
+sub reindex {
+	my ($self, $head) = @_;
+	$self->with_umask(sub { $self->_reindex($head) });
+}
+
 sub rlog {
 	my ($self, $range, $add_cb, $del_cb) = @_;
 	my $hex = '[a-f0-9]';
@@ -349,6 +367,37 @@ sub rlog {
 	$latest;
 }
 
+sub _reindex {
+	my ($self, $head) = @_;
+	my $db = $self->{xdb};
+	my $mm = $self->{mm} = eval {
+		require PublicInbox::Msgmap;
+		PublicInbox::Msgmap->new($self->{git_dir}, 1);
+	};
+	$db->begin_transaction;
+	$head ||= $db->get_metadata('last_commit') || 'HEAD';
+	my $git = PublicInbox::Git->new($self->{git_dir});
+	my $ls = $git->popen(qw(ls-tree -r -z), $head);
+	my $nr = 0;
+	local $/ = "\0";
+	my $h = '[a-f0-9]';
+	while (defined(my $l = <$ls>)) {
+		$l =~ m!\A100644 blob (${h}{40})\t${h}{2}/${h}{38}\0\z! or next;
+		my $bytes;
+		my $mime = do_cat_mail($git, $1, \$bytes) or next;
+		my $num = $mm->num_for(mid_clean(mid_mime($mime)));
+		index_blob($self, $git, $mime, $bytes, $num);
+		++$nr;
+		if (!($nr % 8)) {
+			print STDERR "\r$nr" if -t STDERR;
+			$db->commit_transaction;
+			$db->begin_transaction;
+		}
+	}
+	$db->commit_transaction;
+	print STDERR "\rdone\n" if -t STDERR;
+}
+
 # indexes all unindexed messages
 sub _index_sync {
 	my ($self, $head) = @_;
@@ -375,13 +424,19 @@ sub _index_sync {
 				$mm->last_commit($lx);
 			}
 		} else {
-			# dumb case, msgmap and xapian are out-of-sync
-			# do not care for performance:
+			# Uncommon case: msgmap and Xapian are out-of-sync.
+			# We do not care about performance here (but git is
+			# fast :>).  This happens when we have to reindex
+			# Xapian, since msgmap is a frozen format while our
+			# Xapian format is still evolving.
 			my $r = $lm eq '' ? $head : "$lm..$head";
+
+			# first, ensure msgmap is up-to-date:
 			$lm = $self->rlog($r, *index_mm, *unindex_mm);
 			$mm->{dbh}->commit;
 			$mm->last_commit($lm) if defined $lm;
 
+			# now deal with Xapian
 			$lx = $self->rlog($range, *index_mm2, *unindex_mm2);
 			$db->set_metadata('last_commit', $lx) if defined $lx;
 		}
diff --git a/script/public-inbox-index b/script/public-inbox-index
index 46584c1..1297f06 100755
--- a/script/public-inbox-index
+++ b/script/public-inbox-index
@@ -8,6 +8,7 @@
 
 use strict;
 use warnings;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
 my $usage = "public-inbox-index GIT_DIR";
 use PublicInbox::Config;
 eval { require PublicInbox::SearchIdx };
@@ -15,6 +16,13 @@ if ($@) {
 	print STDERR "Search::Xapian required for $0\n";
 	exit 1;
 }
+
+my $reindex;
+my %opts = (
+	'--reindex' => \$reindex,
+);
+GetOptions(%opts) or die "bad command-line args\n$usage";
+
 my @dirs;
 
 sub resolve_git_dir {
@@ -59,5 +67,9 @@ sub index_dir {
 	-d $git_dir or die "$git_dir does not appear to be a git repository\n";
 
 	my $s = PublicInbox::SearchIdx->new($git_dir, 1);
-	$s->index_sync;
+	if ($reindex) {
+		$s->reindex;
+	} else {
+		$s->index_sync;
+	}
 }

^ permalink raw reply related	[flat|nested] 2+ messages in thread

end of thread, other threads:[~2016-07-11  8:04 UTC | newest]

Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-07-11  4:07 [PATCH] wip-reindex-support Eric Wong
  -- strict thread matches above, loose matches on Subject: below --
2016-07-11  8:04 Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).