dumping ground for random patches and texts
 help / color / mirror / Atom feed
From: Eric Wong <e@80x24.org>
To: spew@80x24.org
Subject: [PATCH 1/5] *search: simplify handling of Xapian term iterators
Date: Thu,  7 Dec 2023 11:42:42 +0000	[thread overview]
Message-ID: <20231207114246.3614375-1-e@80x24.org> (raw)

Xapian has always sorted termlist iterators, so we now:

1) break out of the iterator loop early on non-matches
2) avoid doing sorting ourselves

As a result, we'll also favor the wantarray forms of xap_terms
and all_terms to preserve sort order in most cases.

Confirmed by the Xapian maintainer: <20231201184844.GO4059@survex.com>

Link: https://lists.xapian.org/pipermail/xapian-discuss/2023-December/010013.html
---
 lib/PublicInbox/LeiInspect.pm |  1 -
 lib/PublicInbox/Search.pm     | 19 ++++++++++---------
 lib/PublicInbox/SearchIdx.pm  | 13 ++++++-------
 lib/PublicInbox/xh_cidx.h     | 15 +++++----------
 lib/PublicInbox/xh_mset.h     |  2 +-
 5 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm
index d4ad03eb..88d7949c 100644
--- a/lib/PublicInbox/LeiInspect.pm
+++ b/lib/PublicInbox/LeiInspect.pm
@@ -97,7 +97,6 @@ sub _inspect_doc ($$) {
 		my $term = ($1 // '');
 		push @{$ent->{terms}->{$term}}, $tn;
 	}
-	@$_ = sort(@$_) for values %{$ent->{terms} // {}};
 	$cur = $doc->values_begin;
 	$end = $doc->values_end;
 	for (; $cur != $end; $cur++) {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 8ef17d58..678c8c5d 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -614,16 +614,16 @@ sub get_pct ($) { # mset item
 
 sub xap_terms ($$;@) {
 	my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty ()
-	my %ret;
 	my $end = $xdb_or_doc->termlist_end(@docid);
 	my $cur = $xdb_or_doc->termlist_begin(@docid);
+	$cur->skip_to($pfx);
+	my (@ret, $tn);
+	my $pfxlen = length($pfx);
 	for (; $cur != $end; $cur++) {
-		$cur->skip_to($pfx);
-		last if $cur == $end;
-		my $tn = $cur->get_termname;
-		$ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx);
+		$tn = $cur->get_termname;
+		index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen));
 	}
-	wantarray ? sort(keys(%ret)) : \%ret;
+	wantarray ? @ret : +{ map { $_ => undef } @ret };
 }
 
 # get combined docid from over.num:
@@ -638,11 +638,12 @@ sub all_terms {
 	my ($self, $pfx) = @_;
 	my $cur = xdb($self)->allterms_begin($pfx);
 	my $end = $self->{xdb}->allterms_end($pfx);
-	my %ret;
+	my $pfxlen = length($pfx);
+	my @ret;
 	for (; $cur != $end; $cur++) {
-		$ret{substr($cur->get_termname, length($pfx))} = undef;
+		push @ret, substr($cur->get_termname, $pfxlen);
 	}
-	wantarray ? (sort keys %ret) : \%ret;
+	wantarray ? @ret : +{ map { $_ => undef } @ret };
 }
 
 sub xh_args { # prep getopt args to feed to xap_helper.h socket
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 1bf471fc..1ac8e33e 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -42,7 +42,7 @@ my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
-my @VMD_MAP = (kw => 'K', L => 'L');
+my @VMD_MAP = (kw => 'K', L => 'L'); # value order matters
 our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
 
 sub new {
@@ -608,17 +608,16 @@ sub set_vmd {
 	my ($self, $docid, $vmd) = @_;
 	begin_txn_lazy($self);
 	my $doc = _get_doc($self, $docid) or return;
-	my ($end, @rm, @add);
+	my ($v, @rm, @add);
 	my @x = @VMD_MAP;
+	my ($cur, $end) = ($doc->termlist_begin, $doc->termlist_end);
 	while (my ($field, $pfx) = splice(@x, 0, 2)) {
 		my $set = $vmd->{$field} // next;
 		my %keep = map { $_ => 1 } @$set;
 		my %add = %keep;
-		$end //= $doc->termlist_end;
-		for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) {
-			$cur->skip_to($pfx);
-			last if $cur == $end;
-			my $v = $cur->get_termname;
+		$cur->skip_to($pfx); # works due to @VMD_MAP order
+		for (; $cur != $end; $cur++) {
+			$v = $cur->get_termname;
 			$v =~ s/\A$pfx//s or next;
 			$keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v);
 		}
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
index 1980f9f6..2803b3a4 100644
--- a/lib/PublicInbox/xh_cidx.h
+++ b/lib/PublicInbox/xh_cidx.h
@@ -12,12 +12,9 @@ static void dump_ibx_term(struct req *req, const char *pfx,
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
-
-		if (starts_with(&tn, pfx, pfx_len)) {
-			fprintf(req->fp[0], "%s %s\n",
-				tn.c_str() + pfx_len, ibx_id);
-			++req->nr_out;
-		}
+		if (!starts_with(&tn, pfx, pfx_len)) break;
+		fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id);
+		++req->nr_out;
 	}
 }
 
@@ -95,8 +92,7 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
 	fbuf_init(root_offs);
 	for (cur.skip_to("G"); cur != end; cur++) {
 		std::string tn = *cur;
-		if (!starts_with(&tn, "G", 1))
-			continue;
+		if (!starts_with(&tn, "G", 1)) break;
 		union { const char *in; char *out; } u;
 		u.in = tn.c_str() + 1;
 		e.key = u.out;
@@ -125,8 +121,7 @@ static void dump_roots_term(struct req *req, const char *pfx,
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
-		if (!starts_with(&tn, pfx, pfx_len))
-			continue;
+		if (!starts_with(&tn, pfx, pfx_len)) break;
 		fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
 		fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
 		++req->nr_out;
diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h
index 056fe22b..4e97a284 100644
--- a/lib/PublicInbox/xh_mset.h
+++ b/lib/PublicInbox/xh_mset.h
@@ -11,7 +11,7 @@ static void emit_doc_term(FILE *fp, const char *pfx, Xapian::Document *doc)
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
-		if (!starts_with(&tn, pfx, pfx_len)) continue;
+		if (!starts_with(&tn, pfx, pfx_len)) break;
 		fputc(0, fp);
 		fwrite(tn.data(), tn.size(), 1, fp);
 	}

             reply	other threads:[~2023-12-07 11:42 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-12-07 11:42 Eric Wong [this message]
2023-12-07 11:42 ` [PATCH 2/5] *search: favor wantarray form of xap_terms Eric Wong
2023-12-07 11:42 ` [PATCH 3/5] xap_helper_cxx: drop chdir usage in build Eric Wong
2023-12-07 11:42 ` [PATCH 4/5] makefile: add `check-build' target Eric Wong
2023-12-07 11:42 ` [PATCH 5/5] xap_helper: support term length limit Eric Wong

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20231207114246.3614375-1-e@80x24.org \
    --to=e@80x24.org \
    --cc=spew@80x24.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).