From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF, T_SCC_BODY_TEXT_LINE shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 99CC61F406 for ; Thu, 7 Dec 2023 11:42:46 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1701949366; bh=zAKhH2YphyMdFdnBwvObQP8j+2BTv2SCBrrQNqTxdrA=; h=From:To:Subject:Date:From; b=NewNkt9nvBOoIekuNAxzotlfPr+eGE94nPgN9S9BOQ2RsqvzYhtxQ7ruo/r2UnUnR 0TWL9jBfPkCHb8iZc19r5mb4Z26F51gJZ4adjJRa5Bapd+1/EnyWokQL1GKsgivBPy 8Sr3D9z2V3Nj1T5gEMxhmm0sKH6/4tol8Q/P49PE= From: Eric Wong To: spew@80x24.org Subject: [PATCH 1/5] *search: simplify handling of Xapian term iterators Date: Thu, 7 Dec 2023 11:42:42 +0000 Message-ID: <20231207114246.3614375-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Xapian has always sorted termlist iterators, so we now: 1) break out of the iterator loop early on non-matches 2) avoid doing sorting ourselves As a result, we'll also favor the wantarray forms of xap_terms and all_terms to preserve sort order in most cases. Confirmed by the Xapian maintainer: <20231201184844.GO4059@survex.com> Link: https://lists.xapian.org/pipermail/xapian-discuss/2023-December/010013.html --- lib/PublicInbox/LeiInspect.pm | 1 - lib/PublicInbox/Search.pm | 19 ++++++++++--------- lib/PublicInbox/SearchIdx.pm | 13 ++++++------- lib/PublicInbox/xh_cidx.h | 15 +++++---------- lib/PublicInbox/xh_mset.h | 2 +- 5 files changed, 22 insertions(+), 28 deletions(-) diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm index d4ad03eb..88d7949c 100644 --- a/lib/PublicInbox/LeiInspect.pm +++ b/lib/PublicInbox/LeiInspect.pm @@ -97,7 +97,6 @@ sub _inspect_doc ($$) { my $term = ($1 // ''); push @{$ent->{terms}->{$term}}, $tn; } - @$_ = sort(@$_) for values %{$ent->{terms} // {}}; $cur = $doc->values_begin; $end = $doc->values_end; for (; $cur != $end; $cur++) { diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm index 8ef17d58..678c8c5d 100644 --- a/lib/PublicInbox/Search.pm +++ b/lib/PublicInbox/Search.pm @@ -614,16 +614,16 @@ sub get_pct ($) { # mset item sub xap_terms ($$;@) { my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty () - my %ret; my $end = $xdb_or_doc->termlist_end(@docid); my $cur = $xdb_or_doc->termlist_begin(@docid); + $cur->skip_to($pfx); + my (@ret, $tn); + my $pfxlen = length($pfx); for (; $cur != $end; $cur++) { - $cur->skip_to($pfx); - last if $cur == $end; - my $tn = $cur->get_termname; - $ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx); + $tn = $cur->get_termname; + index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen)); } - wantarray ? sort(keys(%ret)) : \%ret; + wantarray ? @ret : +{ map { $_ => undef } @ret }; } # get combined docid from over.num: @@ -638,11 +638,12 @@ sub all_terms { my ($self, $pfx) = @_; my $cur = xdb($self)->allterms_begin($pfx); my $end = $self->{xdb}->allterms_end($pfx); - my %ret; + my $pfxlen = length($pfx); + my @ret; for (; $cur != $end; $cur++) { - $ret{substr($cur->get_termname, length($pfx))} = undef; + push @ret, substr($cur->get_termname, $pfxlen); } - wantarray ? (sort keys %ret) : \%ret; + wantarray ? @ret : +{ map { $_ => undef } @ret }; } sub xh_args { # prep getopt args to feed to xap_helper.h socket diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm index 1bf471fc..1ac8e33e 100644 --- a/lib/PublicInbox/SearchIdx.pm +++ b/lib/PublicInbox/SearchIdx.pm @@ -42,7 +42,7 @@ my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/; my $xapianlevels = qr/\A(?:full|medium)\z/; my $hex = '[a-f0-9]'; my $OID = $hex .'{40,}'; -my @VMD_MAP = (kw => 'K', L => 'L'); +my @VMD_MAP = (kw => 'K', L => 'L'); # value order matters our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/; sub new { @@ -608,17 +608,16 @@ sub set_vmd { my ($self, $docid, $vmd) = @_; begin_txn_lazy($self); my $doc = _get_doc($self, $docid) or return; - my ($end, @rm, @add); + my ($v, @rm, @add); my @x = @VMD_MAP; + my ($cur, $end) = ($doc->termlist_begin, $doc->termlist_end); while (my ($field, $pfx) = splice(@x, 0, 2)) { my $set = $vmd->{$field} // next; my %keep = map { $_ => 1 } @$set; my %add = %keep; - $end //= $doc->termlist_end; - for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) { - $cur->skip_to($pfx); - last if $cur == $end; - my $v = $cur->get_termname; + $cur->skip_to($pfx); # works due to @VMD_MAP order + for (; $cur != $end; $cur++) { + $v = $cur->get_termname; $v =~ s/\A$pfx//s or next; $keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v); } diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h index 1980f9f6..2803b3a4 100644 --- a/lib/PublicInbox/xh_cidx.h +++ b/lib/PublicInbox/xh_cidx.h @@ -12,12 +12,9 @@ static void dump_ibx_term(struct req *req, const char *pfx, for (cur.skip_to(pfx); cur != end; cur++) { std::string tn = *cur; - - if (starts_with(&tn, pfx, pfx_len)) { - fprintf(req->fp[0], "%s %s\n", - tn.c_str() + pfx_len, ibx_id); - ++req->nr_out; - } + if (!starts_with(&tn, pfx, pfx_len)) break; + fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id); + ++req->nr_out; } } @@ -95,8 +92,7 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc) fbuf_init(root_offs); for (cur.skip_to("G"); cur != end; cur++) { std::string tn = *cur; - if (!starts_with(&tn, "G", 1)) - continue; + if (!starts_with(&tn, "G", 1)) break; union { const char *in; char *out; } u; u.in = tn.c_str() + 1; e.key = u.out; @@ -125,8 +121,7 @@ static void dump_roots_term(struct req *req, const char *pfx, for (cur.skip_to(pfx); cur != end; cur++) { std::string tn = *cur; - if (!starts_with(&tn, pfx, pfx_len)) - continue; + if (!starts_with(&tn, pfx, pfx_len)) break; fputs(tn.c_str() + pfx_len, drt->wbuf.fp); fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp); ++req->nr_out; diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h index 056fe22b..4e97a284 100644 --- a/lib/PublicInbox/xh_mset.h +++ b/lib/PublicInbox/xh_mset.h @@ -11,7 +11,7 @@ static void emit_doc_term(FILE *fp, const char *pfx, Xapian::Document *doc) for (cur.skip_to(pfx); cur != end; cur++) { std::string tn = *cur; - if (!starts_with(&tn, pfx, pfx_len)) continue; + if (!starts_with(&tn, pfx, pfx_len)) break; fputc(0, fp); fwrite(tn.data(), tn.size(), 1, fp); }