dumping ground for random patches and texts
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: spew@80x24.org
Subject: [PATCH] WIP
Date: Sun, 27 Dec 2020 11:36:10 +0000	[thread overview]
Message-ID: <20201227113610.5902-1-e@80x24.org> (raw)

---
 MANIFEST                      |  2 ++
 lib/PublicInbox/LeiSearch.pm  | 14 ++++-----
 lib/PublicInbox/LeiXSearch.pm | 57 +++++++++++++++++++++++++++++++++++
 lib/PublicInbox/Search.pm     | 19 +++++-------
 t/lei_xsearch.t               | 47 +++++++++++++++++++++++++++++
 5 files changed, 119 insertions(+), 20 deletions(-)
 create mode 100644 lib/PublicInbox/LeiXSearch.pm
 create mode 100644 t/lei_xsearch.t

diff --git a/MANIFEST b/MANIFEST
index 656c707e..a5ff81cf 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -165,6 +165,7 @@ lib/PublicInbox/LEI.pm
 lib/PublicInbox/LeiExtinbox.pm
 lib/PublicInbox/LeiSearch.pm
 lib/PublicInbox/LeiStore.pm
+lib/PublicInbox/LeiXSearch.pm
 lib/PublicInbox/Linkify.pm
 lib/PublicInbox/Listener.pm
 lib/PublicInbox/Lock.pm
@@ -327,6 +328,7 @@ t/kqnotify.t
 t/lei-oneshot.t
 t/lei.t
 t/lei_store.t
+t/lei_xsearch.t
 t/linkify.t
 t/main-bin/spamc
 t/mda-mime.eml
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index 66c16e04..0b962b11 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -7,20 +7,18 @@ use v5.10.1;
 use parent qw(PublicInbox::ExtSearch);
 use PublicInbox::Search;
 
-sub combined_docid ($$) {
+# get combined docid from over.num:
+# (not generic Xapian, only works with our sharding scheme)
+sub num2docid ($$) {
 	my ($self, $num) = @_;
-	($num - 1) * $self->{nshard} + 1;
+	my $nshard = $self->{nshard};
+	($num - 1) * $nshard + $num % $nshard + 1;
 }
 
 sub msg_keywords {
 	my ($self, $num) = @_; # num_or_mitem
 	my $xdb = $self->xdb; # set {nshard};
-	my $docid = ref($num) ? $num->get_docid : do {
-		# get combined docid from over.num:
-		# (not generic Xapian, only works with our sharding scheme)
-		my $nshard = $self->{nshard};
-		($num - 1) * $nshard + $num % $nshard + 1;
-	};
+	my $docid = ref($num) ? $num->get_docid : num2docid($self, $num);
 	my %kw;
 	eval {
 		my $end = $xdb->termlist_end($docid);
diff --git a/lib/PublicInbox/LeiXSearch.pm b/lib/PublicInbox/LeiXSearch.pm
new file mode 100644
index 00000000..07ceb84e
--- /dev/null
+++ b/lib/PublicInbox/LeiXSearch.pm
@@ -0,0 +1,57 @@
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# Combine any combination of PublicInbox::Search,
+# PublicInbox::ExtSearch, and PublicInbox::LeiSearch objects
+# into one Xapian DB
+package PublicInbox::LeiXSearch;
+use strict;
+use v5.10.1;
+use parent qw(PublicInbox::LeiSearch);
+
+sub new {
+	my ($class) = @_;
+	PublicInbox::Search::load_xapian();
+	bless {}, $class
+}
+
+sub attach_extinbox {
+	my ($self, $ibxish) = @_; # ibxish = ExtSearch or Inbox
+	if (delete $self->{xdb}) {
+		# clobber existing {xdb} if amending
+		my $expect = delete $self->{nshard};
+		my $shards = delete $self->{shards_flat};
+		scalar(@$shards) == $expect or die
+			"BUG: {nshard}$expect != shards=".scalar(@$shards);
+
+		my $prev = {};
+		for my $old_ibxish (@{$self->{shard2ibx}}) {
+			next if $prev == $old_ibxish;
+			$prev = $old_ibxish;
+			my @shards = $old_ibxish->search->xdb_shards_flat;
+			push @{$self->{shards_flat}}, @shards;
+		}
+		my $nr = scalar(@{$self->{shards_flat}});
+		$nr == $expect or die
+			"BUG: reloaded $nr shards, expected $expect"
+	}
+	my @shards = $ibxish->search->xdb_shards_flat;
+	push @{$self->{shards_flat}}, @shards;
+	push(@{$self->{shard2ibx}}, $ibxish) for (@shards);
+}
+
+# called by PublicInbox::Search::xdb
+sub xdb_shards_flat { @{$_[0]->{shards_flat}} }
+
+# like over->get_art
+sub smsg_for {
+	my ($self, $mitem) = @_;
+	# cf. https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID
+	my $nshard = $self->{nshard};
+	my $docid = $mitem->get_docid;
+	my $shard = ($docid - 1) % $nshard;
+	my $num = int(($docid - 1) / $nshard) + 1;
+	$self->{shard2ibx}->[$shard]->over->get_art($num);
+}
+
+1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 498b6632..73926b1c 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -197,6 +197,7 @@ sub xdb_shards_flat ($) {
 	my ($self) = @_;
 	my $xpfx = $self->{xpfx};
 	my (@xdb, $slow_phrase);
+	load_xapian();
 	if ($xpfx =~ m/xapian${\SCHEMA_VERSION}\z/) {
 		@xdb = ($X{Database}->new($xpfx));
 		$self->{qp_flags} |= FLAG_PHRASE() if !-f "$xpfx/iamchert";
@@ -215,16 +216,6 @@ sub xdb_shards_flat ($) {
 	@xdb;
 }
 
-sub _xdb {
-	my ($self) = @_;
-	$self->{qp_flags} //= $QP_FLAGS;
-	my @xdb = xdb_shards_flat($self) or return;
-	$self->{nshard} = scalar(@xdb);
-	my $xdb = shift @xdb;
-	$xdb->add_database($_) for @xdb;
-	$xdb;
-}
-
 # v2 Xapian docids don't conflict, so they're identical to
 # NNTP article numbers and IMAP UIDs.
 # https://trac.xapian.org/wiki/FAQ/MultiDatabaseDocumentID
@@ -243,8 +234,12 @@ sub mset_to_artnums {
 sub xdb ($) {
 	my ($self) = @_;
 	$self->{xdb} //= do {
-		load_xapian();
-		$self->_xdb;
+		$self->{qp_flags} //= $QP_FLAGS;
+		my @xdb = $self->xdb_shards_flat or return;
+		$self->{nshard} = scalar(@xdb);
+		my $xdb = shift @xdb;
+		$xdb->add_database($_) for @xdb;
+		$xdb;
 	};
 }
 
diff --git a/t/lei_xsearch.t b/t/lei_xsearch.t
new file mode 100644
index 00000000..70e3affd
--- /dev/null
+++ b/t/lei_xsearch.t
@@ -0,0 +1,47 @@
+#!perl -w
+# Copyright (C) 2020 all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use strict;
+use v5.10.1;
+use Test::More;
+use PublicInbox::TestCommon;
+use PublicInbox::ExtSearchIdx;
+use PublicInbox::Eml;
+use PublicInbox::InboxWritable;
+require_mods(qw(DBD::SQLite Search::Xapian));
+require_git 2.6;
+require_ok 'PublicInbox::LeiXSearch';
+my ($home, $for_destroy) = tmpdir();
+my @ibx;
+for my $V (1..2) {
+	for my $i (3..6) {
+		my $ibx = PublicInbox::InboxWritable->new({
+			inboxdir => "$home/v$V-$i",
+			name => "test-v$V-$i",
+			version => $V,
+			indexlevel => 'medium',
+			-primary_address => "v$V-$i\@example.com",
+		}, { nproc => int(rand(8)) + 1 });
+		push @ibx, $ibx;
+		my $im = $ibx->importer(0);
+		my $eml = PublicInbox::Eml->new(<<EOF);
+From: x\@example.com
+To: $ibx->{-primary_address}
+Date: Fri, 02 Oct 1993 0$V:0$i:00 +0000
+Subject: v${V}i${i}
+
+${V}er ${i}on
+EOF
+		$im->add($eml);
+		$im->done;
+	}
+}
+my $first = shift @ibx; is($first->{name}, 'test-v1-3', 'first plucked');
+my $last = pop @ibx; is($last->{name}, 'test-v2-6', 'last plucked');
+my $eidx = PublicInbox::ExtSearchIdx->new("$home/eidx");
+$eidx->attach_inbox($first);
+$eidx->attach_inbox($last);
+$eidx->eidx_sync({fsync => 0});
+my $lxs = PublicInbox::LeiXSearch->new;
+
+done_testing;

             reply	other threads:[~2020-12-27 11:36 UTC|newest]

Thread overview: 23+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2020-12-27 11:36 Eric Wong [this message]
  -- strict thread matches above, loose matches on Subject: below --
2021-10-27 20:16 [PATCH] wip Eric Wong
2021-06-05 19:58 Eric Wong
2021-04-05  7:42 Eric Wong
2021-03-08  7:11 Eric Wong
2021-01-21  4:24 [PATCH] WIP Eric Wong
2021-01-03 22:57 [PATCH] wip Eric Wong
2020-11-15  7:35 Eric Wong
2020-04-23  4:27 Eric Wong
2020-04-20  7:14 Eric Wong
2020-01-13  9:24 [PATCH] WIP Eric Wong
2019-05-11 22:55 Eric Wong
2019-01-02  9:21 [PATCH] wip Eric Wong
2018-07-06 21:31 Eric Wong
2018-06-24 11:55 Eric Wong
2018-06-24  8:39 Eric Wong
2017-07-15  1:42 [PATCH] WIP Eric Wong
2017-04-12 20:17 [PATCH] wip Eric Wong
2017-04-05 18:40 Eric Wong
2016-08-23 20:07 Eric Wong
2016-08-18  2:16 Eric Wong
2016-06-26  3:46 Eric Wong
2015-12-22  0:15 Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20201227113610.5902-1-e@80x24.org \
    --to=e@80x24.org \
    --cc=spew@80x24.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).