dumping ground for random patches and texts
 help / color / mirror / Atom feed
* [PATCH 1/5] *search: simplify handling of Xapian term iterators
@ 2023-12-07 11:42 Eric Wong
  2023-12-07 11:42 ` [PATCH 2/5] *search: favor wantarray form of xap_terms Eric Wong
                   ` (3 more replies)
  0 siblings, 4 replies; 5+ messages in thread
From: Eric Wong @ 2023-12-07 11:42 UTC (permalink / raw)
  To: spew

Xapian has always sorted termlist iterators, so we now:

1) break out of the iterator loop early on non-matches
2) avoid doing sorting ourselves

As a result, we'll also favor the wantarray forms of xap_terms
and all_terms to preserve sort order in most cases.

Confirmed by the Xapian maintainer: <20231201184844.GO4059@survex.com>

Link: https://lists.xapian.org/pipermail/xapian-discuss/2023-December/010013.html
---
 lib/PublicInbox/LeiInspect.pm |  1 -
 lib/PublicInbox/Search.pm     | 19 ++++++++++---------
 lib/PublicInbox/SearchIdx.pm  | 13 ++++++-------
 lib/PublicInbox/xh_cidx.h     | 15 +++++----------
 lib/PublicInbox/xh_mset.h     |  2 +-
 5 files changed, 22 insertions(+), 28 deletions(-)

diff --git a/lib/PublicInbox/LeiInspect.pm b/lib/PublicInbox/LeiInspect.pm
index d4ad03eb..88d7949c 100644
--- a/lib/PublicInbox/LeiInspect.pm
+++ b/lib/PublicInbox/LeiInspect.pm
@@ -97,7 +97,6 @@ sub _inspect_doc ($$) {
 		my $term = ($1 // '');
 		push @{$ent->{terms}->{$term}}, $tn;
 	}
-	@$_ = sort(@$_) for values %{$ent->{terms} // {}};
 	$cur = $doc->values_begin;
 	$end = $doc->values_end;
 	for (; $cur != $end; $cur++) {
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 8ef17d58..678c8c5d 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -614,16 +614,16 @@ sub get_pct ($) { # mset item
 
 sub xap_terms ($$;@) {
 	my ($pfx, $xdb_or_doc, @docid) = @_; # @docid may be empty ()
-	my %ret;
 	my $end = $xdb_or_doc->termlist_end(@docid);
 	my $cur = $xdb_or_doc->termlist_begin(@docid);
+	$cur->skip_to($pfx);
+	my (@ret, $tn);
+	my $pfxlen = length($pfx);
 	for (; $cur != $end; $cur++) {
-		$cur->skip_to($pfx);
-		last if $cur == $end;
-		my $tn = $cur->get_termname;
-		$ret{substr($tn, length($pfx))} = undef if !index($tn, $pfx);
+		$tn = $cur->get_termname;
+		index($tn, $pfx) ? last : push(@ret, substr($tn, $pfxlen));
 	}
-	wantarray ? sort(keys(%ret)) : \%ret;
+	wantarray ? @ret : +{ map { $_ => undef } @ret };
 }
 
 # get combined docid from over.num:
@@ -638,11 +638,12 @@ sub all_terms {
 	my ($self, $pfx) = @_;
 	my $cur = xdb($self)->allterms_begin($pfx);
 	my $end = $self->{xdb}->allterms_end($pfx);
-	my %ret;
+	my $pfxlen = length($pfx);
+	my @ret;
 	for (; $cur != $end; $cur++) {
-		$ret{substr($cur->get_termname, length($pfx))} = undef;
+		push @ret, substr($cur->get_termname, $pfxlen);
 	}
-	wantarray ? (sort keys %ret) : \%ret;
+	wantarray ? @ret : +{ map { $_ => undef } @ret };
 }
 
 sub xh_args { # prep getopt args to feed to xap_helper.h socket
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 1bf471fc..1ac8e33e 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -42,7 +42,7 @@ my $BASE85 = qr/[a-zA-Z0-9\!\#\$\%\&\(\)\*\+\-;<=>\?\@\^_`\{\|\}\~]+/;
 my $xapianlevels = qr/\A(?:full|medium)\z/;
 my $hex = '[a-f0-9]';
 my $OID = $hex .'{40,}';
-my @VMD_MAP = (kw => 'K', L => 'L');
+my @VMD_MAP = (kw => 'K', L => 'L'); # value order matters
 our $INDEXLEVELS = qr/\A(?:full|medium|basic)\z/;
 
 sub new {
@@ -608,17 +608,16 @@ sub set_vmd {
 	my ($self, $docid, $vmd) = @_;
 	begin_txn_lazy($self);
 	my $doc = _get_doc($self, $docid) or return;
-	my ($end, @rm, @add);
+	my ($v, @rm, @add);
 	my @x = @VMD_MAP;
+	my ($cur, $end) = ($doc->termlist_begin, $doc->termlist_end);
 	while (my ($field, $pfx) = splice(@x, 0, 2)) {
 		my $set = $vmd->{$field} // next;
 		my %keep = map { $_ => 1 } @$set;
 		my %add = %keep;
-		$end //= $doc->termlist_end;
-		for (my $cur = $doc->termlist_begin; $cur != $end; $cur++) {
-			$cur->skip_to($pfx);
-			last if $cur == $end;
-			my $v = $cur->get_termname;
+		$cur->skip_to($pfx); # works due to @VMD_MAP order
+		for (; $cur != $end; $cur++) {
+			$v = $cur->get_termname;
 			$v =~ s/\A$pfx//s or next;
 			$keep{$v} ? delete($add{$v}) : push(@rm, $pfx.$v);
 		}
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
index 1980f9f6..2803b3a4 100644
--- a/lib/PublicInbox/xh_cidx.h
+++ b/lib/PublicInbox/xh_cidx.h
@@ -12,12 +12,9 @@ static void dump_ibx_term(struct req *req, const char *pfx,
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
-
-		if (starts_with(&tn, pfx, pfx_len)) {
-			fprintf(req->fp[0], "%s %s\n",
-				tn.c_str() + pfx_len, ibx_id);
-			++req->nr_out;
-		}
+		if (!starts_with(&tn, pfx, pfx_len)) break;
+		fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id);
+		++req->nr_out;
 	}
 }
 
@@ -95,8 +92,7 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
 	fbuf_init(root_offs);
 	for (cur.skip_to("G"); cur != end; cur++) {
 		std::string tn = *cur;
-		if (!starts_with(&tn, "G", 1))
-			continue;
+		if (!starts_with(&tn, "G", 1)) break;
 		union { const char *in; char *out; } u;
 		u.in = tn.c_str() + 1;
 		e.key = u.out;
@@ -125,8 +121,7 @@ static void dump_roots_term(struct req *req, const char *pfx,
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
-		if (!starts_with(&tn, pfx, pfx_len))
-			continue;
+		if (!starts_with(&tn, pfx, pfx_len)) break;
 		fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
 		fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
 		++req->nr_out;
diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h
index 056fe22b..4e97a284 100644
--- a/lib/PublicInbox/xh_mset.h
+++ b/lib/PublicInbox/xh_mset.h
@@ -11,7 +11,7 @@ static void emit_doc_term(FILE *fp, const char *pfx, Xapian::Document *doc)
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
-		if (!starts_with(&tn, pfx, pfx_len)) continue;
+		if (!starts_with(&tn, pfx, pfx_len)) break;
 		fputc(0, fp);
 		fwrite(tn.data(), tn.size(), 1, fp);
 	}

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 2/5] *search: favor wantarray form of xap_terms
  2023-12-07 11:42 [PATCH 1/5] *search: simplify handling of Xapian term iterators Eric Wong
@ 2023-12-07 11:42 ` Eric Wong
  2023-12-07 11:42 ` [PATCH 3/5] xap_helper_cxx: drop chdir usage in build Eric Wong
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2023-12-07 11:42 UTC (permalink / raw)
  To: spew

Most xap_terms callers do not benefit from the hashref
return value, and we can delay hashmap use until
List::Util::uniqstr if needed.
---
 lib/PublicInbox/CodeSearch.pm | 15 ++++++---------
 lib/PublicInbox/LeiSearch.pm  | 17 +++++++----------
 lib/PublicInbox/LeiStore.pm   | 13 +++++++------
 3 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
index 3092718d..48697cdc 100644
--- a/lib/PublicInbox/CodeSearch.pm
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -9,6 +9,7 @@ use v5.12;
 use parent qw(PublicInbox::Search);
 use PublicInbox::Config;
 use PublicInbox::Search qw(retry_reopen int_val xap_terms);
+use PublicInbox::Compat qw(uniqstr);
 use Compress::Zlib qw(uncompress);
 use constant {
 	AT => 0, # author time YYYYMMDDHHMMSS, dt: for mail)
@@ -199,12 +200,11 @@ sub roots2paths { # for diagnostics
 		do {
 			my $mset = $enq->get_mset($off += $size, $lim);
 			for my $x ($mset->items) {
-				my $tmp = xap_terms('P', $x->get_document);
-				push @$dirs, keys %$tmp;
+				push @$dirs, xap_terms('P', $x->get_document);
 			}
 			$size = $mset->size;
 		} while ($size);
-		@$dirs = sort @$dirs;
+		@$dirs = sort(uniqstr(@$dirs));
 	}
 	\%ret;
 }
@@ -223,12 +223,9 @@ sub root_oids ($$) {
 	my @ids = docids_of_git_dir $self, $git_dir or warn <<"";
 BUG? (non-fatal) `$git_dir' not indexed in $self->{topdir}
 
-	my %ret;
-	for my $docid (@ids) {
-		my @oids = xap_terms('G', $self->xdb, $docid);
-		@ret{@oids} = @oids;
-	}
-	sort keys %ret;
+	my @ret = map { xap_terms('G', $self->xdb, $_) } @ids;
+	@ret = uniqstr(@ret) if @ids > 1;
+	@ret;
 }
 
 sub paths2roots {
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index ba4c4309..29e3213f 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -9,6 +9,7 @@ use parent qw(PublicInbox::ExtSearch); # PublicInbox::Search->reopen
 use PublicInbox::Search qw(xap_terms);
 use PublicInbox::ContentHash qw(content_digest content_hash git_sha);
 use PublicInbox::MID qw(mids mids_for_index);
+use PublicInbox::Compat qw(uniqstr);
 use Carp qw(croak);
 
 sub _msg_kw { # retry_reopen callback
@@ -44,20 +45,16 @@ sub oidbin_keywords {
 sub _xsmsg_vmd { # retry_reopen
 	my ($self, $smsg, $want_label) = @_;
 	my $xdb = $self->xdb; # set {nshard};
-	my (%kw, %L, $doc, $x);
-	$kw{flagged} = 1 if delete($smsg->{lei_q_tt_flagged});
+	my (@kw, @L, $doc, $x);
+	@kw = qw(flagged) if delete($smsg->{lei_q_tt_flagged});
 	my @num = $self->over->blob_exists($smsg->{blob});
 	for my $num (@num) { # there should only be one...
 		$doc = $xdb->get_document($self->num2docid($num));
-		$x = xap_terms('K', $doc);
-		%kw = (%kw, %$x);
-		if ($want_label) { # JSON/JMAP only
-			$x = xap_terms('L', $doc);
-			%L = (%L, %$x);
-		}
+		push @kw, xap_terms('K', $doc);
+		push @L, xap_terms('L', $doc) if $want_label # JSON/JMAP only
 	}
-	$smsg->{kw} = [ sort keys %kw ] if scalar(keys(%kw));
-	$smsg->{L} = [ sort keys %L ] if scalar(keys(%L));
+	@{$smsg->{kw}} = sort(uniqstr(@kw)) if @kw;
+	@{$smsg->{L}} = uniqstr(@L) if @L;
 }
 
 # lookup keywords+labels for external messages
diff --git a/lib/PublicInbox/LeiStore.pm b/lib/PublicInbox/LeiStore.pm
index aebb85a9..a752174d 100644
--- a/lib/PublicInbox/LeiStore.pm
+++ b/lib/PublicInbox/LeiStore.pm
@@ -27,6 +27,7 @@ use PublicInbox::MDA;
 use PublicInbox::Spawn qw(spawn);
 use PublicInbox::MdirReader;
 use PublicInbox::LeiToMail;
+use PublicInbox::Compat qw(uniqstr);
 use File::Temp qw(tmpnam);
 use POSIX ();
 use IO::Handle (); # ->autoflush
@@ -341,15 +342,15 @@ sub _add_vmd ($$$$) {
 sub _docids_and_maybe_kw ($$) {
 	my ($self, $docids) = @_;
 	return $docids unless wantarray;
-	my $kw = {};
+	my (@kw, $idx, @tmp);
 	for my $num (@$docids) { # likely only 1, unless ContentHash changes
 		# can't use ->search->msg_keywords on uncommitted docs
-		my $idx = $self->{priv_eidx}->idx_shard($num);
-		my $tmp = eval { $idx->ipc_do('get_terms', 'K', $num) };
-		if ($@) { warn "#$num get_terms: $@" }
-		else { @$kw{keys %$tmp} = values(%$tmp) };
+		$idx = $self->{priv_eidx}->idx_shard($num);
+		@tmp = eval { $idx->ipc_do('get_terms', 'K', $num) };
+		$@ ? warn("#$num get_terms: $@") : push(@kw, @tmp);
 	}
-	($docids, [ sort keys %$kw ]);
+	@kw = sort(uniqstr(@kw)) if @$docids > 1;
+	($docids, \@kw);
 }
 
 sub _reindex_1 { # git->cat_async callback

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 3/5] xap_helper_cxx: drop chdir usage in build
  2023-12-07 11:42 [PATCH 1/5] *search: simplify handling of Xapian term iterators Eric Wong
  2023-12-07 11:42 ` [PATCH 2/5] *search: favor wantarray form of xap_terms Eric Wong
@ 2023-12-07 11:42 ` Eric Wong
  2023-12-07 11:42 ` [PATCH 4/5] makefile: add `check-build' target Eric Wong
  2023-12-07 11:42 ` [PATCH 5/5] xap_helper: support term length limit Eric Wong
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2023-12-07 11:42 UTC (permalink / raw)
  To: spew

While chdir simplifies path manipulation on our end, its use
falls over when PERL5LIB/@INC contains relative paths which need
to be made absolute.  It's fewer lines of code to eliminate
chdir usage than it is to keep using relative paths in most
places.
---
 lib/PublicInbox/XapHelperCxx.pm | 19 +++++++------------
 1 file changed, 7 insertions(+), 12 deletions(-)

diff --git a/lib/PublicInbox/XapHelperCxx.pm b/lib/PublicInbox/XapHelperCxx.pm
index 1aa75f2a..17f988ee 100644
--- a/lib/PublicInbox/XapHelperCxx.pm
+++ b/lib/PublicInbox/XapHelperCxx.pm
@@ -60,17 +60,12 @@ sub build () {
 	}
 	require PublicInbox::CodeSearch;
 	require PublicInbox::Lock;
-	require PublicInbox::OnDestroy;
 	my ($prog) = ($bin =~ m!/([^/]+)\z!);
 	my $lk = PublicInbox::Lock->new("$dir/$prog.lock")->lock_for_scope;
 	write_file '>', "$dir/$prog.cpp", qq{#include "xap_helper.h"\n},
 			PublicInbox::Search::generate_cxx(),
 			PublicInbox::CodeSearch::generate_cxx();
 
-	opendir my $dh, '.';
-	my $restore = PublicInbox::OnDestroy->new(\&chdir, $dh);
-	chdir $dir;
-
 	# xap_modversion may be set by needs_rebuild
 	$xap_modversion //= xap_cfg('--modversion');
 	my $fl = xap_cfg(qw(--libs --cflags));
@@ -83,15 +78,15 @@ sub build () {
 	$^O eq 'netbsd' and $fl =~ s/(\A|[ \t])\-L([^ \t]+)([ \t]|\z)/
 				"$1-L$2 -Wl,-rpath=$2$3"/egsx;
 	my @xflags = split(' ', "$fl $xflags"); # ' ' awk-mode eats leading WS
-	my @cflags = grep(!/\A-(?:Wl|l|L)/, @xflags);
-	run_die([$cxx, '-c', "$prog.cpp", '-I', $srcpfx, @cflags]);
-	run_die([$cxx, '-o', "$prog.tmp", "$prog.o", @xflags]);
-	unlink "$prog.cpp", "$prog.o";
-	write_file '>', 'XFLAGS.tmp', $xflags, "\n";
-	write_file '>', 'xap_modversion.tmp', $xap_modversion, "\n";
+	my @cflags = ('-I', $srcpfx, grep(!/\A-(?:Wl|l|L)/, @xflags));
+	run_die([$cxx, '-o', "$dir/$prog.o", '-c', "$dir/$prog.cpp", @cflags]);
+	run_die([$cxx, '-o', "$dir/$prog.tmp", "$dir/$prog.o", @xflags]);
+	unlink "$dir/$prog.cpp", "$dir/$prog.o";
+	write_file '>', "$dir/XFLAGS.tmp", $xflags, "\n";
+	write_file '>', "$dir/xap_modversion.tmp", $xap_modversion, "\n";
 	undef $xap_modversion; # do we ever build() twice?
 	# not quite atomic, but close enough :P
-	rename("$_.tmp", $_) for ($prog, qw(XFLAGS xap_modversion));
+	rename("$dir/$_.tmp", "$dir/$_") for ($prog, qw(XFLAGS xap_modversion));
 }
 
 sub check_build () {

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 4/5] makefile: add `check-build' target
  2023-12-07 11:42 [PATCH 1/5] *search: simplify handling of Xapian term iterators Eric Wong
  2023-12-07 11:42 ` [PATCH 2/5] *search: favor wantarray form of xap_terms Eric Wong
  2023-12-07 11:42 ` [PATCH 3/5] xap_helper_cxx: drop chdir usage in build Eric Wong
@ 2023-12-07 11:42 ` Eric Wong
  2023-12-07 11:42 ` [PATCH 5/5] xap_helper: support term length limit Eric Wong
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2023-12-07 11:42 UTC (permalink / raw)
  To: spew

A quick build check can detect bugs more quickly than normal
runtime tests.
---
 Makefile.PL | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/Makefile.PL b/Makefile.PL
index 28f8263e..2b2e6b18 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -34,6 +34,19 @@ my @syn = (@EXE_FILES, grep(m!^lib/.*\.pm$!, @manifest), @scripts);
 @syn = grep(!/SaPlugin/, @syn) if !eval { require Mail::SpamAssasin };
 $v->{syn_files} = \@syn;
 $v->{my_syntax} = [map { "$_.syntax" } @syn];
+my %native = (
+	XapHelperCxx => [ qw(xh_cidx.h xh_mset.h xap_helper.h) ],
+);
+my @ck_build;
+for my $m (sort keys %native) {
+	my $hdr = $native{$m};
+	my @dep = map { "lib/PublicInbox/$_" } ("$m.pm", @$hdr);
+	$t->{"$m.check_build: @dep"} = [ "\$(PERL) -w -I lib ".
+		"-MPublicInbox::$m -e PublicInbox::${m}::check_build" ];
+	push @ck_build, "$m.check_build";
+}
+$t->{"check-build: @ck_build"} = [];
+
 my @no_pod;
 $v->{-m1} = [ map {
 		my $x = (split('/'))[-1];

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* [PATCH 5/5] xap_helper: support term length limit
  2023-12-07 11:42 [PATCH 1/5] *search: simplify handling of Xapian term iterators Eric Wong
                   ` (2 preceding siblings ...)
  2023-12-07 11:42 ` [PATCH 4/5] makefile: add `check-build' target Eric Wong
@ 2023-12-07 11:42 ` Eric Wong
  3 siblings, 0 replies; 5+ messages in thread
From: Eric Wong @ 2023-12-07 11:42 UTC (permalink / raw)
  To: spew

This allows us to use p2q-compatible specifications such as
"dfpost7" to only capture blob OIDs which are 7 characters in
length (the indexer will always index down to 7 characters).
---
 lib/PublicInbox/XapHelper.pm | 24 +++++++++++++++---
 lib/PublicInbox/xap_helper.h | 11 ++++++++-
 lib/PublicInbox/xh_cidx.h    | 48 ++++++++++++++++++++++++++++++++----
 t/xap_helper.t               | 33 +++++++++++++++++++++++++
 4 files changed, 106 insertions(+), 10 deletions(-)

diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm
index b21e70a2..ed11a2f8 100644
--- a/lib/PublicInbox/XapHelper.pm
+++ b/lib/PublicInbox/XapHelper.pm
@@ -39,13 +39,24 @@ sub iter_retry_check ($) {
 	}
 }
 
+sub term_length_extract ($) {
+	my ($req) = @_;
+	@{$req->{A_len}} = map {
+		my $len = s/([0-9]+)\z// ? ($1 + 0) : undef;
+		[ $_, $len ];
+	} @{$req->{A}};
+}
+
 sub dump_ibx_iter ($$$) {
 	my ($req, $ibx_id, $it) = @_;
 	my $out = $req->{0};
 	eval {
 		my $doc = $it->get_document;
-		for my $p (@{$req->{A}}) {
-			for (xap_terms($p, $doc)) {
+		for my $pair (@{$req->{A_len}}) {
+			my ($pfx, $len) = @$pair;
+			my @t = xap_terms($pfx, $doc);
+			@t = grep { length == $len } @t if defined($len);
+			for (@t) {
 				print $out "$_ $ibx_id\n" or die "print: $!";
 				++$req->{nr_out};
 			}
@@ -64,6 +75,7 @@ sub cmd_dump_ibx {
 	my ($req, $ibx_id, $qry_str) = @_;
 	$qry_str // die 'usage: dump_ibx [OPTIONS] IBX_ID QRY_STR';
 	$req->{A} or die 'dump_ibx requires -A PREFIX';
+	term_length_extract $req;
 	my $max = $req->{'m'} // $req->{srch}->{xdb}->get_doccount;
 	my $opt = { relevance => -1, limit => $max, offset => $req->{o} // 0 };
 	$opt->{eidx_key} = $req->{O} if defined $req->{O};
@@ -82,8 +94,11 @@ sub dump_roots_iter ($$$) {
 	eval {
 		my $doc = $it->get_document;
 		my $G = join(' ', map { $root2off->{$_} } xap_terms('G', $doc));
-		for my $p (@{$req->{A}}) {
-			for (xap_terms($p, $doc)) {
+		for my $pair (@{$req->{A_len}}) {
+			my ($pfx, $len) = @$pair;
+			my @t = xap_terms($pfx, $doc);
+			@t = grep { length == $len } @t if defined($len);
+			for (@t) {
 				$req->{wbuf} .= "$_ $G\n";
 				++$req->{nr_out};
 			}
@@ -106,6 +121,7 @@ sub cmd_dump_roots {
 	my ($req, $root2off_file, $qry_str) = @_;
 	$qry_str // die 'usage: dump_roots [OPTIONS] ROOT2ID_FILE QRY_STR';
 	$req->{A} or die 'dump_roots requires -A PREFIX';
+	term_length_extract $req;
 	open my $fh, '<', $root2off_file;
 	my $root2off; # record format: $OIDHEX "\0" uint32_t
 	my @x = split(/\0/, read_all $fh);
diff --git a/lib/PublicInbox/xap_helper.h b/lib/PublicInbox/xap_helper.h
index 1f8c426b..3456910b 100644
--- a/lib/PublicInbox/xap_helper.h
+++ b/lib/PublicInbox/xap_helper.h
@@ -123,6 +123,7 @@ typedef bool (*cmd)(struct req *);
 struct req { // argv and pfxv point into global rbuf
 	char *argv[MY_ARG_MAX];
 	char *pfxv[MY_ARG_MAX]; // -A <prefix>
+	size_t *lenv; // -A <prefix>LENGTH
 	struct srch *srch;
 	char *Pgit_dir;
 	char *Oeidx_key;
@@ -727,6 +728,13 @@ static void sigw(int sig) // SIGTERM handler for worker
 	sock_fd = -1; // break out of recv_loop
 }
 
+#define CLEANUP_REQ __attribute__((__cleanup__(req_cleanup)))
+static void req_cleanup(void *ptr)
+{
+	struct req *req = (struct req *)ptr;
+	free(req->lenv);
+}
+
 static void recv_loop(void) // worker process loop
 {
 	static char rbuf[4096 * 33]; // per-process
@@ -737,7 +745,8 @@ static void recv_loop(void) // worker process loop
 
 	while (sock_fd == 0) {
 		size_t len = sizeof(rbuf);
-		struct req req = {};
+		CLEANUP_REQ struct req req = {};
+
 		if (!recv_req(&req, rbuf, &len))
 			continue;
 		if (req.fp[1])
diff --git a/lib/PublicInbox/xh_cidx.h b/lib/PublicInbox/xh_cidx.h
index 2803b3a4..311ca05f 100644
--- a/lib/PublicInbox/xh_cidx.h
+++ b/lib/PublicInbox/xh_cidx.h
@@ -3,16 +3,49 @@
 // This file is only intended to be included by xap_helper.h
 // it implements pieces used by CodeSearchIdx.pm
 
-static void dump_ibx_term(struct req *req, const char *pfx,
+static void term_length_extract(struct req *req)
+{
+	req->lenv = (size_t *)calloc(req->pfxc, sizeof(size_t));
+	if (!req->lenv)
+		EABORT("lenv = calloc(%d %zu)", req->pfxc, sizeof(size_t));
+	for (int i = 0; i < req->pfxc; i++) {
+		char *pfx = req->pfxv[i];
+		// extract trailing digits as length:
+		// $len = s/([0-9]+)\z// ? ($1+0) : 0
+		for (size_t j = 0; pfx[j]; j++) {
+			if (pfx[j] < '0' || pfx[j] > '9')
+				continue;
+			if (j == 0) {
+				warnx("W: `%s' not a valid prefix", pfx);
+				continue;
+			}
+			char *end;
+			unsigned long long tmp = strtoull(pfx + j, &end, 10);
+			if (*end || tmp >= (unsigned long long)SIZE_MAX) {
+				warnx("W: `%s' not recognized", pfx);
+			} else {
+				req->lenv[i] = (size_t)tmp;
+				pfx[j] = 0;
+				break;
+			}
+		}
+	}
+}
+
+static void dump_ibx_term(struct req *req, int p,
 			Xapian::Document *doc, const char *ibx_id)
 {
 	Xapian::TermIterator cur = doc->termlist_begin();
 	Xapian::TermIterator end = doc->termlist_end();
+	const char *pfx = req->pfxv[p];
 	size_t pfx_len = strlen(pfx);
+	size_t term_len = req->lenv[p];
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
 		if (!starts_with(&tn, pfx, pfx_len)) break;
+		if (term_len > 0 && (tn.length() - pfx_len) != term_len)
+			continue;
 		fprintf(req->fp[0], "%s %s\n", tn.c_str() + pfx_len, ibx_id);
 		++req->nr_out;
 	}
@@ -24,7 +57,7 @@ static enum exc_iter dump_ibx_iter(struct req *req, const char *ibx_id,
 	try {
 		Xapian::Document doc = i->get_document();
 		for (int p = 0; p < req->pfxc; p++)
-			dump_ibx_term(req, req->pfxv[p], &doc, ibx_id);
+			dump_ibx_term(req, p, &doc, ibx_id);
 	} catch (const Xapian::DatabaseModifiedError & e) {
 		req->srch->db->reopen();
 		return ITER_RETRY;
@@ -46,6 +79,7 @@ static bool cmd_dump_ibx(struct req *req)
 		EABORT("setlinebuf(fp[0])"); // WTF?
 	req->asc = true;
 	req->sort_col = -1;
+	term_length_extract(req);
 	Xapian::MSet mset = mail_mset(req, req->argv[optind + 1]);
 
 	// @UNIQ_FOLD in CodeSearchIdx.pm can handle duplicate lines fine
@@ -110,18 +144,22 @@ static bool root2offs_str(struct fbuf *root_offs, Xapian::Document *doc)
 
 // writes term values matching @pfx for a given @doc, ending the line
 // with the contents of @root_offs
-static void dump_roots_term(struct req *req, const char *pfx,
+static void dump_roots_term(struct req *req, int p,
 				struct dump_roots_tmp *drt,
 				struct fbuf *root_offs,
 				Xapian::Document *doc)
 {
 	Xapian::TermIterator cur = doc->termlist_begin();
 	Xapian::TermIterator end = doc->termlist_end();
+	const char *pfx = req->pfxv[p];
 	size_t pfx_len = strlen(pfx);
+	size_t term_len = req->lenv[p];
 
 	for (cur.skip_to(pfx); cur != end; cur++) {
 		std::string tn = *cur;
 		if (!starts_with(&tn, pfx, pfx_len)) break;
+		if (term_len > 0 && (tn.length() - pfx_len) != term_len)
+			continue;
 		fputs(tn.c_str() + pfx_len, drt->wbuf.fp);
 		fwrite(root_offs->ptr, root_offs->len, 1, drt->wbuf.fp);
 		++req->nr_out;
@@ -163,8 +201,7 @@ static enum exc_iter dump_roots_iter(struct req *req,
 		if (!root2offs_str(&root_offs, &doc))
 			return ITER_ABORT; // bad request, abort
 		for (int p = 0; p < req->pfxc; p++)
-			dump_roots_term(req, req->pfxv[p], drt,
-					&root_offs, &doc);
+			dump_roots_term(req, p, drt, &root_offs, &doc);
 	} catch (const Xapian::DatabaseModifiedError & e) {
 		req->srch->db->reopen();
 		return ITER_RETRY;
@@ -217,6 +254,7 @@ static bool cmd_dump_roots(struct req *req)
 	req->asc = true;
 	req->sort_col = -1;
 	Xapian::MSet mset = commit_mset(req, req->argv[optind + 1]);
+	term_length_extract(req);
 
 	fbuf_init(&drt.wbuf);
 
diff --git a/t/xap_helper.t b/t/xap_helper.t
index ec78998c..be010c75 100644
--- a/t/xap_helper.t
+++ b/t/xap_helper.t
@@ -241,6 +241,39 @@ for my $n (@NO_CXX) {
 				"#$docid $pfx as expected ($xhc->{impl})";
 		}
 	}
+	my $nr;
+	for my $i (7, 8, 39, 40) {
+		pipe($err_r, $err_w);
+		$r = $xhc->mkreq([ undef, $err_w ], qw(dump_roots -c -A),
+				"XDFPOST$i", (map { ('-d', $_) } @int),
+				$root2id_file, 'dt:19700101'.'000000..');
+		close $err_w;
+		@res = <$r>;
+		my @err = <$err_r>;
+		if (defined $nr) {
+			is scalar(@res), $nr,
+				"got expected results ($xhc->{impl})";
+		} else {
+			$nr //= scalar @res;
+			ok $nr, "got initial results ($xhc->{impl})";
+		}
+		my @oids = (join('', @res) =~ /^([a-f0-9]+) /gms);
+		is_deeply [grep { length == $i } @oids], \@oids,
+			"all OIDs match expected length ($xhc->{impl})";
+		my ($nr_out) = ("@err" =~ /nr_out=(\d+)/);
+		is $nr_out, scalar(@oids), "output count matches $xhc->{impl}"
+			or diag explain(\@res, \@err);
+	}
+	pipe($err_r, $err_w);
+	$r = $xhc->mkreq([ undef, $err_w ], qw(dump_ibx -A XDFPOST7),
+			@ibx_shard_args, qw(13 rt:0..));
+	close $err_w;
+	@res = <$r>;
+	my @err = <$err_r>;
+	my ($nr_out) = ("@err" =~ /nr_out=(\d+)/);
+	my @oids = (join('', @res) =~ /^([a-f0-9]{7}) /gms);
+	is $nr_out, scalar(@oids), "output count matches $xhc->{impl}" or
+		diag explain(\@res, \@err);
 }
 
 done_testing;

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2023-12-07 11:42 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-12-07 11:42 [PATCH 1/5] *search: simplify handling of Xapian term iterators Eric Wong
2023-12-07 11:42 ` [PATCH 2/5] *search: favor wantarray form of xap_terms Eric Wong
2023-12-07 11:42 ` [PATCH 3/5] xap_helper_cxx: drop chdir usage in build Eric Wong
2023-12-07 11:42 ` [PATCH 4/5] makefile: add `check-build' target Eric Wong
2023-12-07 11:42 ` [PATCH 5/5] xap_helper: support term length limit Eric Wong

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).