From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on dcvr.yhbt.net X-Spam-Level: X-Spam-ASN: X-Spam-Status: No, score=-4.2 required=3.0 tests=ALL_TRUSTED,AWL,BAYES_00, DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF shortcircuit=no autolearn=ham autolearn_force=no version=3.4.6 Received: from localhost (dcvr.yhbt.net [127.0.0.1]) by dcvr.yhbt.net (Postfix) with ESMTP id 69E9D1F44D for ; Fri, 19 Apr 2024 01:18:01 +0000 (UTC) DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; d=80x24.org; s=selector1; t=1713489481; bh=3lZzd/XV9K09WAAOAGUpoReySwUf8uD4nIFdj8fo4Jg=; h=From:To:Subject:Date:From; b=J9CgJ7Ena+U667GqTaxEPe+mJAMYvq6EbH0k619ZEmuOSAMK9gC0lLhYlEYx9pNbF 5XHe3tSAxpBN+zhV1M/aBBEaJYf9bLtA2p1bZvuqGGTLMchN9vxJa2YpD13lHDqfNk sEd3qGxVcpgbU3deO8VQZ9iLjjEoCgbIaEeyRpyE= From: Eric Wong To: spew@80x24.org Subject: [PATCH] xap_helper: drop terms+data from `mset' command Date: Fri, 19 Apr 2024 01:18:01 +0000 Message-ID: <20240419011801.4050181-1-e@80x24.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit List-Id: Retrieving Xapian document terms, data (and possibly values) and transferring to the Perl side would be an increase in complexity on both the Perl and C++ sides. It would require more I/O and transient memory use on the Perl side, so let's ignore the document-related stuff here for now for ease-of-development. We can reconsider this change if dropping Xapian Perl bindings entirely and relying on JAOT C++ ever becomes a possibility. 
--- lib/PublicInbox/XapHelper.pm | 5 ----- lib/PublicInbox/xh_mset.h | 10 ---------- t/cindex.t | 17 +++++++++-------- t/xap_helper.t | 21 +++++---------------- 4 files changed, 14 insertions(+), 39 deletions(-) diff --git a/lib/PublicInbox/XapHelper.pm b/lib/PublicInbox/XapHelper.pm index 143853cd..ea13dce1 100644 --- a/lib/PublicInbox/XapHelper.pm +++ b/lib/PublicInbox/XapHelper.pm @@ -151,11 +151,6 @@ sub mset_iter ($$) { my $buf = $it->get_docid; $buf .= "\0".$it->get_percent if $req->{p}; $buf .= "\0".$it->get_rank if $req->{R}; - my $doc = ($req->{A} || $req->{D}) ? $it->get_document : undef; - for my $p (@{$req->{A}}) { - $buf .= "\0".$p.$_ for xap_terms($p, $doc); - } - $buf .= "\0".$doc->get_data if $req->{D}; say { $req->{0} } $buf; }; $@ ? iter_retry_check($req) : 0; diff --git a/lib/PublicInbox/xh_mset.h b/lib/PublicInbox/xh_mset.h index 742b7811..69921687 100644 --- a/lib/PublicInbox/xh_mset.h +++ b/lib/PublicInbox/xh_mset.h @@ -27,16 +27,6 @@ static enum exc_iter mset_iter(const struct req *req, FILE *fp, off_t off, if (req->emit_rank) fprintf(fp, "%c%llu", 0, (unsigned long long)i->get_rank()); - if (req->pfxc || req->emit_docdata) { - Xapian::Document doc = i->get_document(); - for (int p = 0; p < req->pfxc; p++) - emit_doc_term(fp, req->pfxv[p], &doc); - if (req->emit_docdata) { - std::string d = doc.get_data(); - fputc(0, fp); - fwrite(d.data(), d.size(), 1, fp); - } - } fputc('\n', fp); } catch (const Xapian::DatabaseModifiedError & e) { req->srch->db->reopen(); diff --git a/t/cindex.t b/t/cindex.t index d3e79197..aabddca8 100644 --- a/t/cindex.t +++ b/t/cindex.t @@ -147,17 +147,18 @@ if ('multi-repo search') { my $test_xhc = sub { my ($xhc) = @_; + my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); my $impl = $xhc->{impl}; my ($r, @l); - $r = $xhc->mkreq([], qw(mset -D -c -g), $zp_git, @xh_args, 'NUL'); + $r = $xhc->mkreq([], qw(mset -c -g), $zp_git, @xh_args, 'NUL'); chomp(@l = <$r>); like shift(@l), qr/\bmset\.size=2\b/, "got expected 
header $impl"; my %docid2data; my @got = sort map { - my @f = split /\0/; - is scalar(@f), 2, 'got 2 entries'; - $docid2data{$f[0]} = $f[1]; - $f[1]; + my ($docid, @extra) = split /\0/; + is scalar(@extra), 0, 'no extra fields'; + $docid2data{$docid} = + $csrch->xdb->get_document($docid)->get_data; } @l; is_deeply(\@got, $exp, "expected doc_data $impl"); @@ -166,7 +167,6 @@ my $test_xhc = sub { like shift(@l), qr/\bmset.size=0\b/, "got miss in wrong dir $impl"; is_deeply(\@l, [], "no extra lines $impl"); - my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); while (my ($did, $expect) = each %docid2data) { is_deeply($csrch->xdb->get_document($did)->get_data, $expect, "docid=$did data matches"); @@ -179,14 +179,15 @@ SKIP: { require_mods('+SCM_RIGHTS', 1); require PublicInbox::XapClient; my $xhc = PublicInbox::XapClient::start_helper('-j0'); - $test_xhc->($xhc); + my $csrch = PublicInbox::CodeSearch->new("$tmp/ext"); + $test_xhc->($xhc, $csrch); skip 'PI_NO_CXX set', 1 if $ENV{PI_NO_CXX}; $xhc->{impl} =~ /Cxx/ or skip 'C++ compiler or xapian development libs missing', 1; skip 'TEST_XH_CXX_ONLY set', 1 if $ENV{TEST_XH_CXX_ONLY}; local $ENV{PI_NO_CXX} = 1; # force XS or SWIG binding test $xhc = PublicInbox::XapClient::start_helper('-j0'); - $test_xhc->($xhc); + $test_xhc->($xhc, $csrch); } if ('--update') { diff --git a/t/xap_helper.t b/t/xap_helper.t index c2fec6fc..d1394090 100644 --- a/t/xap_helper.t +++ b/t/xap_helper.t @@ -204,7 +204,7 @@ for my $n (@NO_CXX) { $err = do { local $/; <$err_r> }; is $err, "mset.size=6 nr_out=5\n", "got expected status ($xhc->{impl})"; - $r = $xhc->mkreq([], qw(mset -p -A XDFID -A Q), @ibx_shard_args, + $r = $xhc->mkreq([], qw(mset -p), @ibx_shard_args, 'dfn:lib/PublicInbox/Search.pm'); chomp((my $hdr, @res) = readline($r)); like $hdr, qr/\bmset\.size=1\b/, @@ -213,15 +213,14 @@ for my $n (@NO_CXX) { @res = split /\0/, $res[0]; { my $doc = $v2->search->xdb->get_document($res[0]); + ok $doc, 'valid document retrieved'; my @q = 
PublicInbox::Search::xap_terms('Q', $doc); is_deeply \@q, [ $mid ], 'docid usable'; } ok $res[1] > 0 && $res[1] <= 100, 'pct > 0 && <= 100'; - is $res[2], 'XDFID'.$dfid, 'XDFID result matches'; - is $res[3], 'Q'.$mid, 'Q (msgid) mset result matches'; - is scalar(@res), 4, 'only 4 columns in result'; + is scalar(@res), 2, 'only 2 columns in result'; - $r = $xhc->mkreq([], qw(mset -p -A XDFID -A Q), @ibx_shard_args, + $r = $xhc->mkreq([], qw(mset -p), @ibx_shard_args, 'dt:19700101'.'000000..'); chomp(($hdr, @res) = readline($r)); like $hdr, qr/\bmset\.size=6\b/, @@ -232,17 +231,7 @@ for my $n (@NO_CXX) { my $doc = $v2->search->xdb->get_document($docid); ok $pct > 0 && $pct <= 100, "pct > 0 && <= 100 #$docid ($xhc->{impl})"; - my %terms; - for (@rest) { - s/\A([A-Z]+)// or xbail 'no prefix=', \@rest; - push @{$terms{$1}}, $_; - } - while (my ($pfx, $vals) = each %terms) { - @$vals = sort @$vals; - my @q = PublicInbox::Search::xap_terms($pfx, $doc); - is_deeply $vals, \@q, - "#$docid $pfx as expected ($xhc->{impl})"; - } + is scalar(@rest), 0, 'no extra rows returned'; } my $nr; for my $i (7, 8, 39, 40) {