about summary refs log tree commit homepage
path: root/t/cindex.t
diff options
context:
space:
mode:
Diffstat (limited to 't/cindex.t')
-rw-r--r--t/cindex.t302
1 files changed, 302 insertions, 0 deletions
diff --git a/t/cindex.t b/t/cindex.t
new file mode 100644
index 00000000..90236287
--- /dev/null
+++ b/t/cindex.t
@@ -0,0 +1,302 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use PublicInbox::TestCommon;
+use Cwd qw(getcwd);
+use List::Util qw(sum);
+use autodie qw(close mkdir open rename);
+require_mods(qw(json Xapian +SCM_RIGHTS));
+use_ok 'PublicInbox::CodeSearchIdx';
+use PublicInbox::Import;
+my ($tmp, $for_destroy) = tmpdir();
+my $pwd = getcwd();
+my @unused_keys = qw(last_commit has_threadid skip_docdata);
+local $ENV{PI_CONFIG} = '/dev/null';
+# local $ENV{TAIL_ALL} = $ENV{TAIL_ALL} // 1; # while features are unstable
+my $opt = { 1 => \(my $cidx_out), 2 => \(my $cidx_err) };
+
+# I reworked CodeSearchIdx->shard_worker to handle empty trees
+# in the initial commit generated by cvs2svn for xapian.git
+create_coderepo 'empty-tree-root-0600', tmpdir => "$tmp/wt0", sub {
+        xsys_e([qw(/bin/sh -c), <<'EOM']);
+git init -q &&
+git config core.sharedRepository 0600
+tree=$(git mktree </dev/null) &&
+head=$(git symbolic-ref HEAD) &&
+cmt=$(echo 'empty root' | git commit-tree $tree) &&
+git update-ref $head $cmt &&
+echo hi >f &&
+git add f &&
+git commit -q -m hi &&
+git gc -q
+EOM
+}; # /create_coderepo
+
+ok(run_script([qw(-cindex --dangerous -q -g), "$tmp/wt0"]), 'cindex internal');
+{
+        my $exists = -e "$tmp/wt0/.git/public-inbox-cindex/cidx.lock";
+        my @st = stat(_);
+        ok($exists, 'internal dir created');
+        is($st[2] & 0600, 0600, 'mode respects core.sharedRepository');
+        @st = stat("$tmp/wt0/.git/public-inbox-cindex");
+        is($st[2] & 0700, 0700, 'dir mode respects core.sharedRepository');
+}
+
+# it's possible for git to emit NUL characters in diffs
+# (see c4201214cbf10636e2c1ab9131573f735b42c8d4 in linux.git)
+my $zp = create_coderepo 'NUL in patch', sub {
+        my $src = PublicInbox::IO::try_cat("$pwd/COPYING");
+        xsys_e([qw(git init -q)]);
+
+        # needs to be further than FIRST_FEW_BYTES (8000) in git.git
+        $src =~ s/\b(Limitation of Liability\.)\n\n/$1\n\0\n/s or
+                xbail "BUG: no `\\n\\n' in $pwd/COPYING";
+
+        PublicInbox::IO::write_file '>', 'f', $src;
+        xsys_e([qw(/bin/sh -c), <<'EOM']);
+git add f &&
+git commit -q -m 'initial with NUL character'
+EOM
+        $src =~ s/\n\0\n/\n\n/ or xbail "BUG: no `\\n\\0\\n'";
+        PublicInbox::IO::write_file '>', 'f', $src;
+        xsys_e([qw(/bin/sh -c), <<'EOM']);
+git add f &&
+git commit -q -m 'remove NUL character' &&
+git gc -q
+EOM
+}; # /create_coderepo
+
+$zp = File::Spec->rel2abs($zp);
+ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext",
+                '-g', $zp, '-g', "$tmp/wt0" ]),
+        'cindex external');
+ok(-e "$tmp/ext/cidx.lock", 'external dir created');
+ok(!-d "$zp/.git/public-inbox-cindex", 'no cindex in original coderepo');
+
+ok(run_script([qw(-cindex -L medium --dangerous -q -d),
+        "$tmp/med", '-g', $zp, '-g', "$tmp/wt0"]), 'cindex external medium');
+
+
+SKIP: {
+        have_xapian_compact 2;
+        ok(run_script([qw(-compact -q), "$tmp/ext"]), 'compact on full');
+        ok(run_script([qw(-compact -q), "$tmp/med"]), 'compact on medium');
+}
+
+my $no_metadata_set = sub {
+        my ($i, $extra, $xdb) = @_;
+        for my $xdb (@$xdb) {
+                for my $k (@unused_keys, @$extra) {
+                        is($xdb->get_metadata($k) // '', '',
+                                "metadata $k unset in shard #$i");
+                }
+                ++$i;
+        }
+};
+
+{
+        my $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*"));
+        my $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*"));
+        ok($full_size > $mid_size, 'full size > mid size') or
+                diag "full=$full_size mid=$mid_size";
+        for my $l (qw(med ext)) {
+                ok(run_script([qw(-cindex -q --reindex -u -d), "$tmp/$l"]),
+                        "reindex $l");
+        }
+        $mid_size = sum(map { -s $_ } glob("$tmp/med/cidx*/*/*"));
+        $full_size = sum(map { -s $_ } glob("$tmp/ext/cidx*/*/*"));
+        ok($full_size > $mid_size, 'full size > mid size after reindex') or
+                diag "full=$full_size mid=$mid_size";
+        my $csrch = PublicInbox::CodeSearch->new("$tmp/med");
+        my ($xdb0, @xdb) = $csrch->xdb_shards_flat;
+        $no_metadata_set->(0, [], [ $xdb0 ]);
+        is($xdb0->get_metadata('indexlevel'), 'medium',
+                'indexlevel set in shard #0');
+        $no_metadata_set->(1, ['indexlevel'], \@xdb);
+
+        ok(run_script([qw(-cindex -q -L full --reindex -u -d), "$tmp/med"]),
+                'reindex medium as full');
+        @xdb = $csrch->xdb_shards_flat;
+        $no_metadata_set->(0, ['indexlevel'], \@xdb);
+}
+
+use_ok 'PublicInbox::CodeSearch';
+
+
+my @xh_args;
+my $exp = [ 'initial with NUL character', 'remove NUL character' ];
+my $zp_git = "$zp/.git";
+if ('multi-repo search') {
+        my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+        my $mset = $csrch->mset('NUL');
+        is(scalar($mset->items), 2, 'got results');
+        my @have = sort(map { $_->get_document->get_data } $mset->items);
+        is_xdeeply(\@have, $exp, 'got expected subjects');
+
+        $mset = $csrch->mset('NUL', { git_dir => "$tmp/wt0/.git" });
+        is(scalar($mset->items), 0, 'no results with other GIT_DIR');
+
+        $mset = $csrch->mset('NUL', { git_dir => $zp_git });
+        @have = sort(map { $_->get_document->get_data } $mset->items);
+        is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter');
+        my @xdb = $csrch->xdb_shards_flat;
+        $no_metadata_set->(0, ['indexlevel'], \@xdb);
+        @xh_args = $csrch->xh_args;
+}
+
+my $test_xhc = sub {
+        my ($xhc) = @_;
+        my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+        my $impl = $xhc->{impl};
+        my ($r, @l);
+        $r = $xhc->mkreq([], qw(mset -c -g), $zp_git, @xh_args, 'NUL');
+        chomp(@l = <$r>);
+        like shift(@l), qr/\bmset\.size=2\b/, "got expected header $impl";
+        my %docid2data;
+        my @got = sort map {
+                my ($docid, $pct, $rank, @extra) = split /\0/;
+                ok $pct >= 0 && $pct <= 100, 'pct in range';
+                ok $rank >= 0 && $rank <= 100000, 'rank ok';
+                is scalar(@extra), 0, 'no extra fields';
+                $docid2data{$docid} =
+                        $csrch->xdb->get_document($docid)->get_data;
+        } @l;
+        is_deeply(\@got, $exp, "expected doc_data $impl");
+
+        $r = $xhc->mkreq([], qw(mset -c -g), "$tmp/wt0/.git", @xh_args, 'NUL');
+        chomp(@l = <$r>);
+        like shift(@l), qr/\bmset.size=0\b/, "got miss in wrong dir $impl";
+        is_deeply(\@l, [], "no extra lines $impl");
+
+        while (my ($did, $expect) = each %docid2data) {
+                is_deeply($csrch->xdb->get_document($did)->get_data,
+                        $expect, "docid=$did data matches");
+        }
+        ok(!$xhc->{io}->close, "$impl close");
+        is($?, 66 << 8, "got EX_NOINPUT from $impl exit");
+};
+
+SKIP: {
+        require_mods('+SCM_RIGHTS', 1);
+        require PublicInbox::XapClient;
+        my $xhc = PublicInbox::XapClient::start_helper('-j0');
+        my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+        $test_xhc->($xhc, $csrch);
+        skip 'PI_NO_CXX set', 1 if $ENV{PI_NO_CXX};
+        $xhc->{impl} =~ /Cxx/ or
+                skip 'C++ compiler or xapian development libs missing', 1;
+        skip 'TEST_XH_CXX_ONLY set', 1 if $ENV{TEST_XH_CXX_ONLY};
+        local $ENV{PI_NO_CXX} = 1; # force XS or SWIG binding test
+        $xhc = PublicInbox::XapClient::start_helper('-j0');
+        $test_xhc->($xhc, $csrch);
+}
+
+if ('--update') {
+        my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+        my $mset = $csrch->mset('dfn:for-update');
+        is(scalar($mset->items), 0, 'no result before update');
+
+        my $e = \%PublicInbox::TestCommon::COMMIT_ENV;
+        xsys_e([qw(/bin/sh -c), <<'EOM'], $e, { -C => "$tmp/wt0" });
+>for-update && git add for-update && git commit -q -m updated
+EOM
+        ok(run_script([qw(-cindex -qu -d), "$tmp/ext"]), '-cindex -u');
+        $mset = $csrch->reopen->mset('dfn:for-update');
+        is(scalar($mset->items), 1, 'got updated result');
+
+        ok(run_script([qw(-cindex -qu --reindex -d), "$tmp/ext"]), 'reindex');
+        $mset = $csrch->reopen->mset('dfn:for-update');
+        is(scalar($mset->items), 1, 'same result after reindex');
+}
+
+SKIP: { # --prune
+        require_cmd($ENV{XAPIAN_DELVE} || 'xapian-delve', 1);
+        require_git v2.6, 1;
+        my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+        is(scalar($csrch->mset('s:hi')->items), 1, 'got hit');
+
+        rename("$tmp/wt0/.git", "$tmp/wt0/.giit");
+        ok(run_script([qw(-cindex -q --prune -d), "$tmp/ext"], undef, $opt),
+                'prune');
+        is(${$opt->{2}}, '', 'nothing in stderr') or diag explain($opt);
+        $csrch->reopen;
+        is(scalar($csrch->mset('s:hi')->items), 0, 'hit pruned');
+
+        rename("$tmp/wt0/.giit", "$tmp/wt0/.git");
+        ok(run_script([qw(-cindex -qu -d), "$tmp/ext"]), 'update');
+        $csrch->reopen;
+        is(scalar($csrch->mset('s:hi')->items), 0,
+                'hit stays pruned since GIT_DIR was previously pruned');
+        isnt(scalar($csrch->mset('s:NUL')->items), 0,
+                'prune did not clobber entire index');
+}
+
+File::Path::remove_tree("$tmp/ext");
+mkdir("$tmp/ext", 0707);
+ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", '-g', $zp]),
+        'external on existing dir');
+{
+        my @st = stat("$tmp/ext/cidx.lock");
+        is($st[2] & 0777, 0604, 'created lock respects odd permissions');
+}
+
+ok(run_script([qw(-xcpdb), "$tmp/ext"]), 'xcpdb upgrade');
+ok(run_script([qw(-xcpdb -R4), "$tmp/ext"]), 'xcpdb reshard');
+
+SKIP: {
+        have_xapian_compact 2;
+        ok(run_script([qw(-xcpdb -R2 --compact), "$tmp/ext"]),
+                'xcpdb reshard+compact');
+        ok(run_script([qw(-xcpdb --compact), "$tmp/ext"]), 'xcpdb compact');
+};
+
+SKIP: {
+        require_cmd('join', 1);
+        my $basic = create_inbox 'basic', indexlevel => 'basic', sub {
+                my ($im, $ibx) = @_;
+                $im->add(eml_load('t/plack-qp.eml'));
+        };
+        my $env = { PI_CONFIG => "$tmp/pi_config" };
+        PublicInbox::IO::write_file '>', $env->{PI_CONFIG}, <<EOM;
+[publicinbox "basictest"]
+        inboxdir = $basic->{inboxdir}
+        address = basic\@example.com
+EOM
+        my $cmd = [ qw(-cindex -u --all -d), "$tmp/ext",
+                '--join=aggressive,dt:19700101000000..now',
+                '-I', $basic->{inboxdir} ];
+        $cidx_out = $cidx_err = '';
+        ok(run_script($cmd, $env, $opt), 'join w/o search');
+        like($cidx_err, qr/W: \Q$basic->{inboxdir}\E not indexed for search/s,
+                'non-Xapian-enabled inbox noted');
+}
+
+# we need to support blank sections for a top-level repos
+# (e.g. <https://example.com/my-project>
+# git.kernel.org could use "pub" as section name, though, since all git repos
+# are currently under //git.kernel.org/pub/**/*
+{
+        mkdir(my $d = "$tmp/blanksection");
+        my $cfg = cfg_new($d, <<EOM);
+[cindex ""]
+        topdir = $tmp/ext
+        localprefix = $tmp
+EOM
+        my $csrch = $cfg->lookup_cindex('');
+        is ref($csrch), 'PublicInbox::CodeSearch', 'codesearch w/ blank name';
+        is_deeply $csrch->{localprefix}, [ "$tmp" ], 'localprefix respected';
+        my $nr = 0;
+        $cfg->each_cindex(sub {
+                my ($cs, @rest) = @_;
+                is $cs->{topdir}, $csrch->{topdir}, 'each_cindex works';
+                is_deeply \@rest, [ '.' ], 'got expected arg';
+                ++$nr;
+        }, '.');
+        is $nr, 1, 'iterated through cindices';
+        my $oid = 'dba13ed2ddf783ee8118c6a581dbf75305f816a3';
+        my $mset = $csrch->mset("dfpost:$oid");
+        is $mset->size, 1, 'got result from full OID search';
+}
+
+done_testing;