* [PATCH 02/10] search: relocate all_terms from lei_search
2023-03-16 20:01 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
@ 2023-03-16 20:01 ` Eric Wong
2023-03-16 20:01 ` [PATCH 03/10] admin: hoist out resolve_git_dir Eric Wong
` (7 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-16 20:01 UTC (permalink / raw)
To: spew
This will be used for code_search, too.
---
lib/PublicInbox/LeiSearch.pm | 14 --------------
lib/PublicInbox/Search.pm | 14 ++++++++++++++
2 files changed, 14 insertions(+), 14 deletions(-)
diff --git a/lib/PublicInbox/LeiSearch.pm b/lib/PublicInbox/LeiSearch.pm
index 936c2751..ba4c4309 100644
--- a/lib/PublicInbox/LeiSearch.pm
+++ b/lib/PublicInbox/LeiSearch.pm
@@ -158,20 +158,6 @@ sub kw_changed {
join("\0", @$new_kw_sorted) eq $cur_kw ? 0 : 1;
}
-sub all_terms {
- my ($self, $pfx) = @_;
- my $xdb = $self->xdb;
- my $cur = $xdb->allterms_begin($pfx);
- my $end = $xdb->allterms_end($pfx);
- my %ret;
- for (; $cur != $end; $cur++) {
- my $tn = $cur->get_termname;
- index($tn, $pfx) == 0 and
- $ret{substr($tn, length($pfx))} = undef;
- }
- wantarray ? (sort keys %ret) : \%ret;
-}
-
sub qparse_new {
my ($self) = @_;
my $qp = $self->SUPER::qparse_new; # PublicInbox::Search
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index e858729a..7aba2445 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -553,4 +553,18 @@ sub num2docid ($$) {
($num - 1) * $nshard + $num % $nshard + 1;
}
+sub all_terms {
+ my ($self, $pfx) = @_;
+ my $cur = xdb($self)->allterms_begin($pfx);
+ my $end = $self->{xdb}->allterms_end($pfx);
+ my %ret;
+ for (; $cur != $end; $cur++) {
+ my $tn = $cur->get_termname;
+ index($tn, $pfx) == 0 and
+ $ret{substr($tn, length($pfx))} = undef;
+ }
+ wantarray ? (sort keys %ret) : \%ret;
+}
+
+
1;
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 03/10] admin: hoist out resolve_git_dir
2023-03-16 20:01 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
2023-03-16 20:01 ` [PATCH 02/10] search: relocate all_terms from lei_search Eric Wong
@ 2023-03-16 20:01 ` Eric Wong
2023-03-16 20:01 ` [PATCH 04/10] admin: ensure resolved GIT_DIR is absolute Eric Wong
` (6 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-16 20:01 UTC (permalink / raw)
To: spew
We'll be using this for indexing git coderepos.
---
lib/PublicInbox/Admin.pm | 10 ++++++++--
1 file changed, 8 insertions(+), 2 deletions(-)
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index 11ea8f83..a3b41d99 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -69,13 +69,19 @@ sub resolve_inboxdir {
die "`$try' is not a directory\n";
}
}
+ my $dir = resolve_git_dir($cd);
+ $$ver = 1 if $ver;
+ $dir;
+}
+
+sub resolve_git_dir {
+ my ($cd) = @_;
# try v1 bare git dirs
my $cmd = [ qw(git rev-parse --git-dir) ];
my $fh = popen_rd($cmd, undef, {-C => $cd});
my $dir = do { local $/; <$fh> };
- close $fh or die "error in @$cmd (cwd:${\($cd // '.')}): $!\n";
+ close $fh or die "error in @$cmd (cwd:${\($cd // '.')}): $?\n";
chomp $dir;
- $$ver = 1 if $ver;
rel2abs_collapsed($dir eq '.' ? ($cd // $dir) : $dir);
}
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 04/10] admin: ensure resolved GIT_DIR is absolute
2023-03-16 20:01 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
2023-03-16 20:01 ` [PATCH 02/10] search: relocate all_terms from lei_search Eric Wong
2023-03-16 20:01 ` [PATCH 03/10] admin: hoist out resolve_git_dir Eric Wong
@ 2023-03-16 20:01 ` Eric Wong
2023-03-16 20:01 ` [PATCH 05/10] test_common: create_inbox: use `$!' properly on mkdir failure Eric Wong
` (5 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-16 20:01 UTC (permalink / raw)
To: spew
We'll also support the $base arg of File::Spec->rel2abs
since it should make codesearch indexing easier.
---
lib/PublicInbox/Admin.pm | 4 +++-
lib/PublicInbox/Config.pm | 2 +-
script/public-inbox-convert | 2 +-
3 files changed, 5 insertions(+), 3 deletions(-)
diff --git a/lib/PublicInbox/Admin.pm b/lib/PublicInbox/Admin.pm
index a3b41d99..53566dc6 100644
--- a/lib/PublicInbox/Admin.pm
+++ b/lib/PublicInbox/Admin.pm
@@ -82,7 +82,9 @@ sub resolve_git_dir {
my $dir = do { local $/; <$fh> };
close $fh or die "error in @$cmd (cwd:${\($cd // '.')}): $?\n";
chomp $dir;
- rel2abs_collapsed($dir eq '.' ? ($cd // $dir) : $dir);
+ # --absolute-git-dir requires git v2.13.0+
+ $dir = rel2abs_collapsed($dir, $cd) if $dir !~ m!\A/!;
+ $dir;
}
# for unconfigured inboxes
diff --git a/lib/PublicInbox/Config.pm b/lib/PublicInbox/Config.pm
index cdf06d85..905e632c 100644
--- a/lib/PublicInbox/Config.pm
+++ b/lib/PublicInbox/Config.pm
@@ -369,7 +369,7 @@ sub git_bool {
# is sufficient and doesn't leave "/.." or "/../"
sub rel2abs_collapsed {
require File::Spec;
- my $p = File::Spec->rel2abs($_[-1]);
+ my $p = File::Spec->rel2abs(@_);
return $p if substr($p, -3, 3) ne '/..' && index($p, '/../') < 0;
require Cwd;
Cwd::abs_path($p);
diff --git a/script/public-inbox-convert b/script/public-inbox-convert
index 42955a48..5f4f2020 100755
--- a/script/public-inbox-convert
+++ b/script/public-inbox-convert
@@ -75,7 +75,7 @@ if ($opt->{'index'}) {
}
local %ENV = (%$env, %ENV) if $env;
my $new = { %$old };
-$new->{inboxdir} = $cfg->rel2abs_collapsed($new_dir);
+$new->{inboxdir} = PublicInbox::Config::rel2abs_collapsed($new_dir);
$new->{version} = 2;
$new = PublicInbox::InboxWritable->new($new, { nproc => $opt->{jobs} });
$new->{-no_fsync} = 1 if !$opt->{fsync};
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 05/10] test_common: create_inbox: use `$!' properly on mkdir failure
2023-03-16 20:01 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
` (2 preceding siblings ...)
2023-03-16 20:01 ` [PATCH 04/10] admin: ensure resolved GIT_DIR is absolute Eric Wong
@ 2023-03-16 20:01 ` Eric Wong
2023-03-16 20:01 ` [PATCH 06/10] codesearch: initial cut w/ -cindex tool Eric Wong
` (4 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-16 20:01 UTC (permalink / raw)
To: spew
stat(2) may fail and set `$!', too, so we must stash it, first.
---
lib/PublicInbox/TestCommon.pm | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm
index 5807105a..ed28ac48 100644
--- a/lib/PublicInbox/TestCommon.pm
+++ b/lib/PublicInbox/TestCommon.pm
@@ -709,9 +709,9 @@ sub create_inbox ($$;@) {
my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!);
my $dir = "t/data-gen/$base.$ident-$db";
my $new = !-d $dir;
- if ($new) {
- mkdir $dir; # may race
- -d $dir or BAIL_OUT "$dir could not be created: $!";
+ if ($new && !mkdir($dir)) {
+ my $err = $!;
+ -d $dir or xbail "mkdir($dir): $err";
}
my $lk = bless { lock_path => "$dir/creat.lock" }, 'PublicInbox::Lock';
$opt{inboxdir} = File::Spec->rel2abs($dir);
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 06/10] codesearch: initial cut w/ -cindex tool
2023-03-16 20:01 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
` (3 preceding siblings ...)
2023-03-16 20:01 ` [PATCH 05/10] test_common: create_inbox: use `$!' properly on mkdir failure Eric Wong
@ 2023-03-16 20:01 ` Eric Wong
2023-03-16 20:01 ` [PATCH 07/10] cindex: parallelize prep phases Eric Wong
` (3 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-16 20:01 UTC (permalink / raw)
To: spew
It seems relying on root commits is a reasonable way to
deduplicate and handle repositories with common history.
I initially wanted to shoehorn this into extindex, but decided a
separate Xapian index layout capable of being both external to
handle many forks and internal (in $GIT_DIR/public-inbox-cindex)
for small projects is the right way to go.
Unlike most existing parts of public-inbox, this relies on
absolute paths of $GIT_DIR stored in the Xapian DB and does not
rely on the config file. We'll be relying on the config file to
map absolute paths to public URL paths for WWW.
---
MANIFEST | 4 +
lib/PublicInbox/CodeSearch.pm | 121 +++++++++
lib/PublicInbox/CodeSearchIdx.pm | 425 +++++++++++++++++++++++++++++++
lib/PublicInbox/MiscIdx.pm | 2 +-
lib/PublicInbox/Search.pm | 63 +++--
lib/PublicInbox/SearchIdx.pm | 88 ++++---
lib/PublicInbox/TestCommon.pm | 41 ++-
lib/PublicInbox/ViewVCS.pm | 2 +-
script/public-inbox-cindex | 75 ++++++
t/cindex.t | 98 +++++++
10 files changed, 850 insertions(+), 69 deletions(-)
create mode 100644 lib/PublicInbox/CodeSearch.pm
create mode 100644 lib/PublicInbox/CodeSearchIdx.pm
create mode 100755 script/public-inbox-cindex
create mode 100644 t/cindex.t
diff --git a/MANIFEST b/MANIFEST
index bc652e21..40535233 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -161,6 +161,8 @@ lib/PublicInbox/AltId.pm
lib/PublicInbox/AutoReap.pm
lib/PublicInbox/Cgit.pm
lib/PublicInbox/CmdIPC4.pm
+lib/PublicInbox/CodeSearch.pm
+lib/PublicInbox/CodeSearchIdx.pm
lib/PublicInbox/CompressNoop.pm
lib/PublicInbox/Config.pm
lib/PublicInbox/ConfigIter.pm
@@ -363,6 +365,7 @@ sa_config/README
sa_config/root/etc/spamassassin/public-inbox.pre
sa_config/user/.spamassassin/user_prefs
script/lei
+script/public-inbox-cindex
script/public-inbox-clone
script/public-inbox-compact
script/public-inbox-convert
@@ -402,6 +405,7 @@ t/altid.t
t/altid_v2.t
t/cgi.t
t/check-www-inbox.perl
+t/cindex.t
t/clone-coderepo-puh1.sh
t/clone-coderepo-puh2.sh
t/clone-coderepo.psgi
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
new file mode 100644
index 00000000..1dfc124f
--- /dev/null
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -0,0 +1,121 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# read-only external index for coderepos
+# currently, it only indexes commits and repository metadata
+# (pathname, root commits); not blob contents
+package PublicInbox::CodeSearch;
+use v5.12;
+use parent qw(PublicInbox::Search);
+use PublicInbox::Search qw(retry_reopen int_val xap_terms);
+use constant {
+ AT => 0, # author time (YYYYMMDDHHMMSS, dt: for mail)
+ CT => 1, # commit time (Unix time stamp, like TS/rt: in mail)
+ CIDX_SCHEMA_VER => 1, # brand new schema for code search
+ # for repos (`Tr'), CT(col=1) is used for the latest tip commit time
+ # in refs/{heads,tags}. AT(col=0) may be used to store disk usage
+ # in the future, but disk usage calculation is expensive w/ alternates
+};
+
+# note: the non-X term prefix allocations are shared with Xapian omega,
+# see xapian-applications/omega/docs/termprefixes.rst
+# bool_pfx_internal:
+# type => 'T', # 'c' - commit, 'r' - repo GIT_DIR
+# tags are not indexed, only normal branches (refs/heads/*), not hidden
+# 'P' # (pathname) GIT_DIR # uniq
+# 'G' # (group) root commit (may have multiple roots)
+my %bool_pfx_external = (
+ oid => 'Q', # type:commit - git OID hex (40|64)-byte SHA-(1|256)
+ # type:repo - rel2abs_collapsed(GIT_DIR)
+ parent => 'XP',
+ %PublicInbox::Search::PATCH_BOOL_COMMON,
+);
+
+my %prob_prefix = ( # copied from PublicInbox::Search
+ # do we care about committer? or partial commit OID via Xapian?
+ # o => 'XQ', # 'oid:' (bool) is exact, 'o:' (prob) can do partial
+ %PublicInbox::Search::PATCH_PROB_COMMON,
+
+ # default:
+ '' => 'S A XQUOT XFN ' . $PublicInbox::Search::NON_QUOTED_BODY
+);
+
+sub new {
+ my ($cls, $dir) = @_;
+ bless { xpfx => "$dir/cidx".CIDX_SCHEMA_VER }, $cls;
+}
+
+sub cqparse_new ($) {
+ my ($self) = @_;
+ my $qp = $self->qp_init_common;
+ my $cb = $qp->can('add_valuerangeprocessor') //
+ $qp->can('add_rangeprocessor'); # Xapian 1.5.0+
+ $cb->($qp, $PublicInbox::Search::NVRP->new(AT, 'd:')); # mairix compat
+ $cb->($qp, $PublicInbox::Search::NVRP->new(AT, 'dt:')); # mail compat
+ $cb->($qp, $PublicInbox::Search::NVRP->new(CT, 'ct:'));
+
+ while (my ($name, $pfx) = each %bool_pfx_external) {
+ $qp->add_boolean_prefix($name, $_) for split(/ /, $pfx);
+ }
+ while (my ($name, $pfx) = each %prob_prefix) {
+ $qp->add_prefix($name, $_) for split(/ /, $pfx);
+ }
+ $qp;
+}
+
+# returns a Xapian::Query to filter by roots
+sub roots_filter { # retry_reopen callback
+ my ($self, $git_dir) = @_;
+ my $xdb = $self->xdb;
+ my $P = 'P'.$git_dir;
+ my ($cur, $end) = ($xdb->postlist_begin($P), $xdb->postlist_end($P));
+ if ($cur == $end) {
+ warn "W: $git_dir not indexed?\n";
+ return;
+ }
+ my @roots = xap_terms('G', $xdb, $cur->get_docid);
+ if (!@roots) {
+ warn "W: $git_dir has no root commits?\n";
+ return;
+ }
+ my $q = $PublicInbox::Search::X{Query}->new('G'.shift(@roots));
+ for my $r (@roots) {
+ $q = $PublicInbox::Search::X{Query}->new(
+ PublicInbox::Search::OP_OR(),
+ $q, 'G'.$r);
+ }
+ $q;
+}
+
+sub mset {
+ my ($self, $qry_str, $opt) = @_;
+ my $qp = $self->{qp} //= cqparse_new($self);
+ my $qry = $qp->parse_query($qry_str, $self->{qp_flags});
+
+ # limit to commits with shared roots
+ if (defined(my $git_dir = $opt->{git_dir})) {
+ my $rf = retry_reopen($self, \&roots_filter, $git_dir)
+ or return;
+
+ $qry = $PublicInbox::Search::X{Query}->new(
+ PublicInbox::Search::OP_FILTER(),
+ $qry, $rf);
+ }
+
+ # we only want commits:
+ $qry = $PublicInbox::Search::X{Query}->new(
+ PublicInbox::Search::OP_FILTER(),
+ $qry, 'T'.'c');
+
+ my $enq = $PublicInbox::Search::X{Enquire}->new($self->xdb);
+ $enq->set_query($qry);
+ if ($opt->{relevance}) {
+ $enq->set_sort_by_relevance_then_value(CT, !$opt->{asc});
+ } else {
+ $enq->set_sort_by_value_then_relevance(CT, !$opt->{asc});
+ }
+ $self->retry_reopen($self->can('enquire_once'), $enq,
+ $opt->{offset} || 0, $opt->{limit} || 50);
+}
+
+1;
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
new file mode 100644
index 00000000..218338da
--- /dev/null
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -0,0 +1,425 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# indexer for git coderepos, just commits and repo paths for now
+# this stores normalized absolute paths of indexed GIT_DIR inside
+# the DB itself and is designed to handle forks by designating roots
+#
+# Unlike mail search, docid isn't tied to NNTP artnum or IMAP UID,
+# there's no serial number dependency at all. The first 32-bits of
+# the commit SHA-(1|256) is used to select a shard.
+#
+# We shard repos using the first 32-bits of sha256($ABS_GIT_DIR)
+#
+# See PublicInbox::CodeSearch (read-only API) for more
+package PublicInbox::CodeSearchIdx;
+use v5.12;
+use parent qw(PublicInbox::Lock PublicInbox::CodeSearch PublicInbox::SearchIdx);
+use PublicInbox::Eml;
+use PublicInbox::DS ();
+use PublicInbox::IPC qw(nproc_shards);
+use PublicInbox::Admin;
+use POSIX qw(WNOHANG SEEK_SET);
+use File::Path ();
+use File::Spec ();
+use PublicInbox::SHA qw(sha256_hex);
+use PublicInbox::Search qw(xap_terms);
+use PublicInbox::SearchIdx qw(add_val);
+use PublicInbox::Config;
+use PublicInbox::Spawn qw(run_die);
+
+# stop walking history if we see >$SEEN_MAX existing commits, this assumes
+# branches don't diverge by more than this number of commits...
+# git walks commits quickly if it doesn't have to read trees
+our $SEEN_MAX = 100000;
+
+# TODO: do we care about committer name + email? or tree OID?
+my @FMT = qw(H P ct an ae at s b); # (b)ody must be last
+my @LOG_STDIN = (qw(log --no-decorate --no-color --no-notes -p --stat -M
+ --stdin --no-walk=unsorted), '--pretty=format:%n%x00'.
+ join('%n', map { "%$_" } @FMT));
+
+sub new {
+ my (undef, $dir, $opt) = @_;
+ my $l = $opt->{indexlevel} // 'full';
+ $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and
+ die "invalid indexlevel=$l\n";
+ $l eq 'basic' and die "E: indexlevel=basic not supported\n";
+ my $self = bless {
+ xpfx => "$dir/cidx". PublicInbox::CodeSearch::CIDX_SCHEMA_VER,
+ cidx_dir => $dir,
+ creat => 1, # TODO: get rid of this, should be implicit
+ indexlevel => $l,
+ transact_bytes => 0, # for checkpoint
+ total_bytes => 0, # for lock_release
+ current_info => '',
+ parallel => 1,
+ -opt => $opt,
+ lock_path => "$dir/cidx.lock",
+ }, __PACKAGE__;
+ $self->{nshard} = count_shards($self) ||
+ nproc_shards({nproc => $opt->{jobs}});
+ $self->{-no_fsync} = 1 if !$opt->{fsync};
+ $self->{-dangerous} = 1 if $opt->{dangerous};
+ $self;
+}
+
+# TODO: may be used for reshard/compact
+sub count_shards { scalar($_[0]->xdb_shards_flat) }
+
+sub add_commit ($$) {
+ my ($self, $cmt) = @_; # fields from @FMT
+ my $x = 'Q'.$cmt->{H};
+ for (docids_by_postlist($self, $x)) {
+ $self->{xdb}->delete_document($_)
+ }
+ my $doc = $PublicInbox::Search::X{Document}->new;
+ $doc->add_boolean_term($x);
+ $doc->add_boolean_term('G'.$_) for @{$self->{roots}};
+ $doc->add_boolean_term('XP'.$_) for split(/ /, $cmt->{P});
+ $doc->add_boolean_term('T'.'c');
+
+ # Author-Time is compatible with dt: for mail search schema_version=15
+ add_val($doc, PublicInbox::CodeSearch::AT,
+ POSIX::strftime('%Y%m%d%H%M%S', gmtime($cmt->{at})));
+
+ # Commit-Time is the fallback used by rt: (TS) for mail search:
+ add_val($doc, PublicInbox::CodeSearch::CT, $cmt->{ct});
+
+ $self->term_generator->set_document($doc);
+
+ # email address is always indexed with positional data for usability
+ $self->index_phrase("$cmt->{an} <$cmt->{ae}>", 1, 'A');
+
+ $x = $cmt->{'s'};
+ $self->index_text($x, 1, 'S') if $x =~ /\S/s;
+ $doc->set_data($x); # subject is the first (and currently only) line
+
+ $x = delete $cmt->{b};
+ $self->index_body_text($doc, \$x) if $x =~ /\S/s;
+ $self->{xdb}->add_document($doc);
+}
+
+sub progress {
+ my ($self, @msg) = @_;
+ my $pr = $self->{-opt}->{-progress} or return;
+ $pr->($self->{git} ? ("$self->{git}->{git_dir}: ") : (), @msg, "\n");
+}
+
+# Record (or refresh) the per-repository document for the current GIT_DIR.
+# $repo is the hashref built by prep_git_dir; the fields read here are:
+#   shard - CodeSearchIdx object for the shard owning this repo (consumed)
+#   id    - docid of an existing repo document, if one was found
+#   ct    - latest tip commit time
+#   fp    - refs fingerprint, stored as the document data
+#   roots - arrayref of root commit OIDs
+# Runs in the parent process after all shard workers have exited.
+sub store_repo ($$) {
+	my ($self, $repo) = @_;
+	my $xdb = delete($repo->{shard})->idx_acquire;
+	$xdb->begin_transaction;
+	if (defined $repo->{id}) {
+		my $doc = $xdb->get_document($repo->{id}) //
+			die "$self->{git}->{git_dir} doc #$repo->{id} gone";
+		add_val($doc, PublicInbox::CodeSearch::CT, $repo->{ct});
+		# roots live in $repo here; $self->{roots} is only assigned
+		# inside the forked shard worker (see index_git_dir)
+		my %new = map { $_ => undef } @{$repo->{roots}};
+		my $old = xap_terms('G', $doc);
+		# add roots which are not yet on the document...
+		delete @new{keys %$old};
+		$doc->add_boolean_term('G'.$_) for keys %new;
+		# ...and drop roots which are no longer reachable
+		delete @$old{@{$repo->{roots}}};
+		$doc->remove_term('G'.$_) for keys %$old;
+		$doc->set_data($repo->{fp});
+		$xdb->replace_document($repo->{id}, $doc);
+	} else {
+		my $new = $PublicInbox::Search::X{Document}->new;
+		add_val($new, PublicInbox::CodeSearch::CT, $repo->{ct});
+		$new->add_boolean_term("P$self->{git}->{git_dir}");
+		$new->add_boolean_term('T'.'r');
+		$new->add_boolean_term('G'.$_) for @{$repo->{roots}};
+		$new->set_data($repo->{fp}); # \n delimited
+		$xdb->add_document($new);
+	}
+	$xdb->commit_transaction;
+}
+
+# sharded reader for `git log --pretty=format: --stdin'
+sub shard_worker ($$$) {
+ my ($self, $r, $sigset) = @_;
+ my ($quit, $cmt);
+ my $batch_bytes = $self->{-opt}->{batch_size} //
+ $PublicInbox::SearchIdx::BATCH_BYTES;
+ my $max = $batch_bytes;
+ $SIG{USR1} = sub { $max = -1 }; # similar to `git fast-import'
+ $SIG{QUIT} = $SIG{TERM} = $SIG{INT} = sub { $quit = shift };
+ PublicInbox::DS::sig_setmask($sigset);
+
+ # the parent process of this shard process writes directly to
+ # the stdin of `git log', we consume git log's stdout:
+ my $rd = $self->{git}->popen(@LOG_STDIN, undef, { 0 => $r });
+ close $r or die "close: $!";
+ my $nr = 0;
+
+ # a patch may have \0, see c4201214cbf10636e2c1ab9131573f735b42c8d4
+ # in linux.git, so we use $/ = "\n\0" to check end-of-patch
+ my $FS = "\n\0";
+ local $/ = $FS;
+ my $buf = <$rd> // return; # leading $FS
+ $buf eq $FS or die "BUG: not LF-NUL: $buf\n";
+ my $xdb = $self->idx_acquire;
+ $xdb->begin_transaction;
+ while (defined($buf = <$rd>)) {
+ chomp($buf);
+ $max -= length($buf);
+ @$cmt{@FMT} = split(/\n/, $buf, scalar(@FMT));
+ $/ = "\n";
+ add_commit($self, $cmt);
+ last if $quit; # likely SIGPIPE
+ ++$nr;
+ if ($max <= 0 && !$PublicInbox::Search::X{CLOEXEC_UNSET}) {
+ progress($self, $nr);
+ $xdb->commit_transaction;
+ $max = $batch_bytes;
+ $xdb->begin_transaction;
+ }
+ $/ = $FS;
+ }
+ close($rd);
+ if (!$? || ($quit && ($? & 127) == POSIX::SIGPIPE)) {
+ $xdb->commit_transaction;
+ } else {
+ warn "E: git @LOG_STDIN: \$?=$?\n";
+ $xdb->cancel_transaction;
+ }
+}
+
+sub seen ($$) {
+ my ($xdb, $q) = @_; # $q = "Q$COMMIT_HASH"
+ $xdb->postlist_begin($q) != $xdb->postlist_end($q)
+}
+
+# used to select the shard for a GIT_DIR
+sub git_dir_hash ($) { hex(substr(sha256_hex($_[0]), 0, 8)) }
+
+sub docids_by_postlist ($$) { # consider moving to PublicInbox::Search
+ my ($self, $q) = @_;
+ my $cur = $self->{xdb}->postlist_begin($q);
+ my $end = $self->{xdb}->postlist_end($q);
+ my @ids;
+ for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) };
+ @ids;
+}
+
+sub get_roots ($$) {
+ my ($self, $refs) = @_;
+ my @roots = $self->{git}->qx([qw(rev-list --stdin --max-parents=0)],
+ undef, { 0 => $refs });
+ die "git rev-list \$?=$?" if $?;
+ sysseek($refs, 0, SEEK_SET) or die "seek: $!"; # for rev-list --stdin
+ chomp(@roots);
+ scalar(@roots) ? \@roots : undef;
+}
+
+# this is different from the grokmirror-compatible fingerprint since we
+# only care about --heads (branches) and --tags, and not even their names
+sub cidx_fp ($) {
+ my ($self) = @_;
+ open my $refs, '+>', undef or die "open: $!";
+ run_die(['git', "--git-dir=$self->{git}->{git_dir}",
+ qw(show-ref --heads --tags --hash)], undef, { 1 => $refs });
+ seek($refs, 0, SEEK_SET) or die "seek: $!";
+ my $buf;
+ my $dig = PublicInbox::SHA->new(256);
+ while (read($refs, $buf, 65536)) { $dig->add($buf) }
+ sysseek($refs, 0, SEEK_SET) or die "seek: $!"; # for rev-list --stdin
+ ($dig->hexdigest, $refs);
+}
+
+# TODO: should we also index gitweb.owner and the full fingerprint for grokmirror?
+sub prep_git_dir ($) {
+ my ($self) = @_;
+ my $git_dir = $self->{git}->{git_dir};
+ my $ct = $self->{git}->qx([qw[for-each-ref
+ --sort=-committerdate --format=%(committerdate:raw) --count=1
+ refs/heads/ refs/tags/]]);
+ my $repo = {};
+ @$repo{qw(fp refs)} = cidx_fp($self);
+ $repo->{roots} = get_roots($self, $repo->{refs});
+ if (!$repo->{roots} || !defined($ct)) {
+ warn "W: $git_dir has no root commits, skipping\n";
+ return;
+ }
+ $ct =~ s/ .*\z//s; # drop TZ
+ $repo->{ct} = $ct + 0;
+ my $n = git_dir_hash($git_dir) % $self->{nshard};
+ my $shard = $repo->{shard} = bless { %$self, shard => $n }, ref($self);
+ delete @$shard{qw(lockfh lock_path)};
+ local $shard->{xdb};
+ my $xdb = $shard->idx_acquire;
+ my @docids = docids_by_postlist($shard, 'P'.$git_dir);
+ my $docid = shift(@docids) // return $repo;
+ if (@docids) {
+ warn "BUG: $git_dir indexed multiple times, culling\n";
+ $xdb->begin_transaction;
+ for (@docids) { $xdb->delete_document($_) }
+ $xdb->commit_transaction;
+ }
+ my $doc = $xdb->get_document($docid) //
+ die "BUG: no #$docid ($git_dir)";
+ my $old_fp = $doc->get_data;
+ if ($old_fp eq $repo->{fp}) { # no change
+ progress($self, 'unchanged');
+ return;
+ }
+ $repo->{id} = $docid;
+ $repo;
+}
+
+sub partition_refs ($$) {
+ my ($self, $refs) = @_; # show-ref --heads --tags --hash output
+ my $fh = $self->{git}->popen(qw(rev-list --stdin), undef,
+ { 0 => $refs });
+ close $refs or die "close: $!";
+ local $self->{xdb};
+ my $xdb = $self->{-opt}->{reindex} ? undef : $self->xdb;
+ my ($seen, $nchange, $nshard) = (0, 0, $self->{nshard});
+ my @shard_in;
+ for (0..($nshard - 1)) {
+ open $shard_in[$_], '+>', undef or die "open: $!";
+ }
+ while (defined(my $cmt = <$fh>)) {
+ chomp $cmt;
+ if ($xdb && seen($xdb, 'Q'.$cmt)) {
+ last if ++$seen > $SEEN_MAX;
+ } else {
+ my $n = hex(substr($cmt, 0, 8)) % $nshard;
+ say { $shard_in[$n] } $cmt or die "say: $!";
+ ++$nchange;
+ $seen = 0;
+ }
+ }
+ close($fh);
+ if (!$? || (($? & 127) == POSIX::SIGPIPE && $seen > $SEEN_MAX)) {
+ $self->{nchange} += $nchange;
+ progress($self, "$nchange commits");
+ for my $fh (@shard_in) {
+ $fh->flush or die "flush: $!";
+ sysseek($fh, 0, SEEK_SET) or die "seek: $!";
+ }
+ return @shard_in;
+ }
+ die "git-rev-list: \$?=$?\n";
+}
+
+sub index_git_dir ($$) {
+ my ($self, $git_dir) = @_;
+ local $self->{git} = PublicInbox::Git->new($git_dir); # for ->patch_id
+ my $repo = prep_git_dir($self) or return;
+ local $self->{current_info} = $git_dir;
+ my @shard_in = partition_refs($self, delete($repo->{refs}));
+ my %pids;
+ my $fwd_kill = sub {
+ my ($sig) = @_;
+ kill($sig, $_) for keys %pids;
+ };
+ local $SIG{USR1} = $fwd_kill;
+ local $SIG{QUIT} = $fwd_kill;
+ local $SIG{INT} = $fwd_kill;
+ local $SIG{TERM} = $fwd_kill;
+ my $sigset = PublicInbox::DS::block_signals();
+ for (my $n = 0; $n <= $#shard_in; $n++) {
+ -s $shard_in[$n] or next;
+ my $pid = fork // die "fork: $!";
+ if ($pid == 0) { # no RNG use, here
+ $0 = "code index [$n]";
+ $self->{shard} = $n;
+ $self->{current_info} = "$self->{current_info} [$n]";
+ delete @$self{qw(lockfh lock_path)};
+ my $in = $shard_in[$n];
+ @shard_in = ();
+ $self->{roots} = delete $repo->{roots};
+ undef $repo;
+ eval { shard_worker($self, $in, $sigset) };
+ warn "E: $@" if $@;
+ POSIX::_exit($@ ? 1 : 0);
+ } else {
+ $pids{$pid} = "code index [$n]";
+ }
+ }
+ PublicInbox::DS::sig_setmask($sigset);
+ @shard_in = ();
+ my $err;
+ while (keys %pids) {
+ my $pid = waitpid(-1, 0) or last;
+ my $j = delete $pids{$pid} // "unknown PID:$pid";
+ next if $? == 0;
+ warn "PID:$pid $j exited with \$?=$?\n";
+ $err = 1;
+ }
+ die "subprocess(es) failed\n" if $err;
+ store_repo($self, $repo);
+ progress($self, 'done');
+ # TODO: check fp afterwards?
+}
+
+# for PublicInbox::SearchIdx::patch_id and with_umask
+sub git { $_[0]->{git} }
+
+# Build the final, de-duplicated list of GIT_DIRs in $self->{git_dirs}.
+# With --update, every GIT_DIR already recorded in the index ('P' terms)
+# is appended to whatever the caller supplied.
+sub load_existing ($) { # for -u/--update
+	my ($self) = @_;
+	# `//=' so the list is stored back on $self even when the caller
+	# supplied none; cidx_run reads $self->{git_dirs} afterwards
+	my $dirs = $self->{git_dirs} //= [];
+	if ($self->{-opt}->{update}) {
+		local $self->{xdb};
+		$self->xdb or
+			die "E: $self->{cidx_dir} non-existent for --update\n";
+		my @cur = $self->all_terms('P');
+		push @$dirs, @cur;
+	}
+	my %uniq; # List::Util::uniq requires Perl 5.26+
+	@$dirs = grep { !$uniq{$_}++ } @$dirs;
+}
+
+sub cidx_init ($) {
+ my ($self) = @_;
+ my $dir = $self->{cidx_dir};
+ unless (-d $dir) {
+ warn "# creating $dir\n" if !$self->{-opt}->{quiet};
+ File::Path::mkpath($dir);
+ }
+ for my $n (0..($self->{nshard} - 1)) {
+ my $shard = bless { %$self, shard => $n }, ref($self);
+ $shard->idx_acquire;
+ }
+ # this warning needs to happen after idx_acquire
+ state $once;
+ warn <<EOM if $PublicInbox::Search::X{CLOEXEC_UNSET} && !$once++;
+W: Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks,
+W: memory usage may be high for large indexing runs
+EOM
+}
+
+# Top-level driver: prepare the shards, take the cidx lock, assemble the
+# list of GIT_DIRs (load_existing) and index each of them in turn.
+# While running, warnings are prefixed with $self->{current_info}
+# whenever it is non-empty.
+sub cidx_run {
+	my ($self) = @_;
+	cidx_init($self);
+	local $self->{current_info} = '';
+	my $cb = $SIG{__WARN__} || \&CORE::warn;
+	local $SIG{__WARN__} = sub {
+		my $m = shift @_;
+		$self->{current_info} eq '' or
+			$m =~ s/\A(#?\s*)/$1$self->{current_info}: /;
+		$cb->($m, @_);
+	};
+	$self->lock_acquire;
+	load_existing($self);
+	my @nc = grep { File::Spec->canonpath($_) ne $_ } @{$self->{git_dirs}};
+	if (@nc) {
+		warn "E: BUG? paths in $self->{cidx_dir} not canonicalized:\n";
+		for my $d (@{$self->{git_dirs}}) {
+			# canonicalize the loop variable ($d), not $_
+			my $c = File::Spec->canonpath($d);
+			warn "E: $d => $c\n";
+			$d = $c; # $d aliases the array element
+		}
+		warn "E: canonicalized and attempting to continue\n";
+	}
+	local $self->{nchange} = 0;
+	# do_prune($self) if $self->{-opt}->{prune}; TODO
+	if ($self->{-opt}->{scan} // 1) {
+		for my $gd (@{$self->{git_dirs}}) {
+			index_git_dir($self, $gd);
+		}
+	}
+	# lock_release is told whether any commits were added
+	$self->lock_release(!!$self->{nchange});
+}
+
+1;
diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm
index 19200b92..6708527d 100644
--- a/lib/PublicInbox/MiscIdx.pm
+++ b/lib/PublicInbox/MiscIdx.pm
@@ -5,7 +5,7 @@
# Things indexed include:
# * inboxes themselves
# * epoch information
-# * (maybe) git code repository information
+# * (maybe) git code repository information (not commits)
# Expect ~100K-1M documents with no parallelism opportunities,
# so no sharding, here.
#
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 7aba2445..5133a3b7 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -110,43 +110,50 @@ sub load_xapian () {
# a prefix common in patch emails
our $LANG = 'english';
+our %PATCH_BOOL_COMMON = (
+ dfpre => 'XDFPRE',
+ dfpost => 'XDFPOST',
+ dfblob => 'XDFPRE XDFPOST',
+ patchid => 'XDFID',
+);
+
# note: the non-X term prefix allocations are shared with
# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
my %bool_pfx_external = (
mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
lid => 'G', # newsGroup (or similar entity), just inside <>
- dfpre => 'XDFPRE',
- dfpost => 'XDFPOST',
- dfblob => 'XDFPRE XDFPOST',
- patchid => 'XDFID',
+ %PATCH_BOOL_COMMON
);
-my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
-my %prob_prefix = (
- # for mairix compatibility
+# for mairix compatibility
+our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
+our %PATCH_PROB_COMMON = (
s => 'S',
- m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
- l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
f => 'A',
- t => 'XTO',
- tc => 'XTO XCC',
- c => 'XCC',
- tcf => 'XTO XCC A',
- a => 'XTO XCC A',
- b => $non_quoted_body . ' XQUOT',
- bs => $non_quoted_body . ' XQUOT S',
+ b => $NON_QUOTED_BODY . ' XQUOT',
+ bs => $NON_QUOTED_BODY . ' XQUOT S',
n => 'XFN',
q => 'XQUOT',
- nq => $non_quoted_body,
+ nq => $NON_QUOTED_BODY,
dfn => 'XDFN',
dfa => 'XDFA',
dfb => 'XDFB',
dfhh => 'XDFHH',
dfctx => 'XDFCTX',
+);
+my %prob_prefix = (
+ m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
+ l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
+ t => 'XTO',
+ tc => 'XTO XCC',
+ c => 'XCC',
+ tcf => 'XTO XCC A',
+ a => 'XTO XCC A',
+ %PATCH_PROB_COMMON,
# default:
- '' => 'XM S A XQUOT XFN ' . $non_quoted_body,
+ '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY,
);
# not documenting m: and mid: for now, the using the URLs works w/o Xapian
@@ -305,7 +312,7 @@ sub date_parse_prepare {
$x = "\0%Y%m%d%H%M%S$#$to_parse\0";
}
}
- } else { # "rt", let git interpret "YYYY", deal with Y10K later :P
+ } else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P
for my $x (@r) {
next if $x eq '' || $x =~ /\A[0-9]{5,}\z/;
push @$to_parse, $x;
@@ -454,20 +461,24 @@ sub mset_to_smsg {
# read-write
sub stemmer { $X{Stem}->new($LANG) }
-# read-only
-sub qparse_new {
+sub qp_init_common {
my ($self) = @_;
-
- my $xdb = xdb($self);
my $qp = $X{QueryParser}->new;
$qp->set_default_op(OP_AND());
- $qp->set_database($xdb);
+ $qp->set_database(xdb($self));
$qp->set_stemmer(stemmer($self));
$qp->set_stemming_strategy(STEM_SOME());
my $cb = $qp->can('set_max_wildcard_expansion') //
$qp->can('set_max_expansion'); # Xapian 1.5.0+
$cb->($qp, 100);
- $cb = $qp->can('add_valuerangeprocessor') //
+ $qp;
+}
+
+# read-only
+sub qparse_new {
+ my ($self) = @_;
+ my $qp = qp_init_common($self);
+ my $cb = $qp->can('add_valuerangeprocessor') //
$qp->can('add_rangeprocessor'); # Xapian 1.5.0+
$cb->($qp, $NVRP->new(YYYYMMDD, 'd:'));
$cb->($qp, $NVRP->new(DT, 'dt:'));
@@ -546,7 +557,7 @@ sub xap_terms ($$;@) {
}
# get combined docid from over.num:
-# (not generic Xapian, only works with our sharding scheme)
+# (not generic Xapian, only works with our sharding scheme for mail)
sub num2docid ($$) {
my ($self, $num) = @_;
my $nshard = $self->{nshard};
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index fc464383..3baeaa9c 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -134,6 +134,7 @@ sub idx_acquire {
load_xapian_writable();
$flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN;
}
+ my $owner = $self->{ibx} // $self->{eidx} // $self;
if ($self->{creat}) {
require File::Path;
$self->lock_acquire;
@@ -145,14 +146,13 @@ sub idx_acquire {
File::Path::mkpath($dir);
require PublicInbox::Syscall;
PublicInbox::Syscall::nodatacow_dir($dir);
- $self->{-set_has_threadid_once} = 1;
- if (($self->{ibx} // $self->{eidx})->{-dangerous}) {
- $flag |= $DB_DANGEROUS;
- }
+ # owner == self for CodeSearchIdx
+ $self->{-set_has_threadid_once} = 1 if $owner != $self;
+ $flag |= $DB_DANGEROUS if $owner->{-dangerous};
}
}
return unless defined $flag;
- $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync};
+ $flag |= $DB_NO_SYNC if $owner->{-no_fsync};
my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) };
croak "Failed opening $dir: $@" if $@;
$self->{xdb} = $xdb;
@@ -350,43 +350,30 @@ sub index_diff ($$$) {
index_text($self, join("\n", @$xnq), 1, 'XNQ');
}
-sub index_xapian { # msg_iter callback
- my $part = $_[0]->[0]; # ignore $depth and $idx
- my ($self, $doc) = @{$_[1]};
- my $ct = $part->content_type || 'text/plain';
- my $fn = $part->filename;
- if (defined $fn && $fn ne '') {
- index_phrase($self, $fn, 1, 'XFN');
- }
- if ($part->{is_submsg}) {
- my $mids = mids_for_index($part);
- index_ids($self, $doc, $part, $mids);
- my $smsg = bless {}, 'PublicInbox::Smsg';
- $smsg->populate($part);
- index_headers($self, $smsg);
- }
-
- my ($s, undef) = msg_part_text($part, $ct);
- defined $s or return;
- $_[0]->[0] = $part = undef; # free memory
+sub patch_id {
+ my ($self) = @_; # $_[1] is the diff (may be huge)
+ open(my $fh, '+>:utf8', undef) or die "open: $!";
+ open(my $eh, '+>', undef) or die "open: $!";
+ $fh->autoflush(1);
+ print $fh $_[1] or die "print: $!";
+ sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";
+ my $id = ($self->{ibx} // $self->{eidx} // $self)->git->qx(
+ [qw(patch-id --stable)], {}, { 0 => $fh, 2 => $eh });
+ seek($eh, 0, SEEK_SET) or die "seek: $!";
+ while (<$eh>) { warn $_ }
+ $id =~ /\A([a-f0-9]{40,})/ ? $1 : undef;
+}
- if ($s =~ /^(?:diff|---|\+\+\+) /ms) {
- open(my $fh, '+>:utf8', undef) or die "open: $!";
- open(my $eh, '+>', undef) or die "open: $!";
- $fh->autoflush(1);
- print $fh $s or die "print: $!";
- sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";
- my $id = ($self->{ibx} // $self->{eidx})->git->qx(
- [qw(patch-id --stable)],
- {}, { 0 => $fh, 2 => $eh });
- $id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1);
- seek($eh, 0, SEEK_SET) or die "seek: $!";
- while (<$eh>) { warn $_ }
+sub index_body_text {
+ my ($self, $doc, $sref) = @_;
+ if ($$sref =~ /^(?:diff|---|\+\+\+) /ms) {
+ my $id = patch_id($self, $$sref);
+ $doc->add_term('XDFID'.$id) if defined($id);
}
# split off quoted and unquoted blocks:
- my @sections = PublicInbox::MsgIter::split_quotes($s);
- undef $s; # free memory
+ my @sections = PublicInbox::MsgIter::split_quotes($$sref);
+ undef $$sref; # free memory
for my $txt (@sections) {
if ($txt =~ /\A>/) {
if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) {
@@ -396,8 +383,7 @@ sub index_xapian { # msg_iter callback
(?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx;
}
index_text($self, $txt, 0, 'XQUOT');
- } else {
- # does it look like a diff?
+ } else { # does it look like a diff?
if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
index_diff($self, \$txt, $doc);
} else {
@@ -408,6 +394,28 @@ sub index_xapian { # msg_iter callback
}
}
+sub index_xapian { # msg_iter callback
+ my $part = $_[0]->[0]; # ignore $depth and $idx
+ my ($self, $doc) = @{$_[1]};
+ my $ct = $part->content_type || 'text/plain';
+ my $fn = $part->filename;
+ if (defined $fn && $fn ne '') {
+ index_phrase($self, $fn, 1, 'XFN');
+ }
+ if ($part->{is_submsg}) {
+ my $mids = mids_for_index($part);
+ index_ids($self, $doc, $part, $mids);
+ my $smsg = bless {}, 'PublicInbox::Smsg';
+ $smsg->populate($part);
+ index_headers($self, $smsg);
+ }
+
+ my ($s, undef) = msg_part_text($part, $ct);
+ defined $s or return;
+ $_[0]->[0] = $part = undef; # free memory
+ index_body_text($self, $doc, \$s);
+}
+
sub index_list_id ($$$) {
my ($self, $doc, $hdr) = @_;
for my $l ($hdr->header_raw('List-Id')) {
diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm
index ed28ac48..494323c0 100644
--- a/lib/PublicInbox/TestCommon.pm
+++ b/lib/PublicInbox/TestCommon.pm
@@ -21,6 +21,7 @@ BEGIN {
@EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods
run_script start_script key2sub xsys xsys_e xqx eml_load tick
have_xapian_compact json_utf8 setup_public_inboxes create_inbox
+ create_coderepo
tcp_host_port test_lei lei lei_ok $lei_out $lei_err $lei_opt
test_httpd xbail require_cmd is_xdeeply tail_f
ignore_inline_c_missing);
@@ -325,7 +326,7 @@ sub run_script ($;$$) {
}
}
my $tail = @tail_paths ? tail_f(@tail_paths) : undef;
- if ($key =~ /-(index|convert|extindex|convert|xcpdb)\z/) {
+ if ($key =~ /-(index|cindex|extindex|convert|xcpdb)\z/) {
unshift @argv, '--no-fsync';
}
if ($run_mode == 0) {
@@ -698,6 +699,44 @@ sub setup_public_inboxes () {
@ret;
}
+our %COMMIT_ENV = (
+ GIT_AUTHOR_NAME => 'A U Thor',
+ GIT_COMMITTER_NAME => 'C O Mitter',
+ GIT_AUTHOR_EMAIL => 'a@example.com',
+ GIT_COMMITTER_EMAIL => 'c@example.com',
+);
+
+sub create_coderepo ($$;@) {
+ my $ident = shift;
+ my $cb = pop;
+ my %opt = @_;
+ require PublicInbox::Lock;
+ require PublicInbox::Import;
+ my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!);
+ my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!);
+ my $dir = "t/data-gen/$base.$ident-$db";
+ my $new = !-d $dir;
+ if ($new && !mkdir($dir)) {
+ my $err = $!;
+ -d $dir or xbail "mkdir($dir): $err";
+ }
+ my $lk = bless { lock_path => "$dir/creat.lock" }, 'PublicInbox::Lock';
+ my $scope = $lk->lock_for_scope;
+ my $tmpdir = delete $opt{tmpdir};
+ if (!-f "$dir/creat.stamp") {
+ opendir(my $dfh, '.') or xbail "opendir .: $!";
+ chdir($dir) or xbail "chdir($dir): $!";
+ local %ENV = (%ENV, %COMMIT_ENV);
+ $cb->($dir);
+ chdir($dfh) or xbail "cd -: $!";
+ open my $s, '>', "$dir/creat.stamp" or
+ BAIL_OUT "error creating $dir/creat.stamp: $!";
+ }
+ return $dir if !defined($tmpdir);
+ xsys_e([qw(/bin/cp -Rp), $dir, $tmpdir]);
+ $tmpdir;
+}
+
sub create_inbox ($$;@) {
my $ident = shift;
my $cb = pop;
diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index de8600ee..716582a6 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -34,7 +34,7 @@ my $hl = eval {
my %QP_MAP = ( A => 'oid_a', a => 'path_a', b => 'path_b' );
our $MAX_SIZE = 1024 * 1024; # TODO: configurable
-my $BIN_DETECT = 8000; # same as git
+my $BIN_DETECT = 8000; # same as git (buffer_is_binary())
my $SHOW_FMT = '--pretty=format:'.join('%n', '%P', '%p', '%H', '%T', '%s', '%f',
'%an <%ae> %ai', '%cn <%ce> %ci', '%b%x00');
diff --git a/script/public-inbox-cindex b/script/public-inbox-cindex
new file mode 100755
index 00000000..d3a5bfca
--- /dev/null
+++ b/script/public-inbox-cindex
@@ -0,0 +1,75 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+my $help = <<EOF; # the following should fit w/o scrolling in 80x24 term:
+usage: public-inbox-cindex [options] GIT_DIR...
+usage: public-inbox-cindex [options] --project-list=FILE PROJECT_ROOT
+
+ Create and update search indices for code repos
+
+ -d DIR use DIR instead of GIT_DIR/public-inbox-cindex
+ --no-fsync speed up indexing, risk corruption on power outage
+ -L LEVEL `medium', or `full' (default: medium)
+ --project-list=FILE use a cgit/gitweb-compatible list of projects
+ --update | -u update previously-indexed code repos with `-d'
+ --jobs=NUM set or disable parallelization (NUM=0)
+ --batch-size=BYTES flush changes to OS after a given number of bytes
+ --prune prune old repos and commits
+ --reindex reindex previously indexed repos
+ --verbose | -v increase verbosity (may be repeated)
+
+BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
+See public-inbox-cindex(1) man page for full documentation.
+EOF
+my $opt = { fsync => 1, scan => 1 }; # --no-scan is hidden
+GetOptions($opt, qw(quiet|q verbose|v+ reindex jobs|j=i fsync|sync! dangerous
+ indexlevel|index-level|L=s batch_size|batch-size=s
+ project-list=s
+ d=s update|u scan! prune dry-run|n C=s@ help|h))
+ or die $help;
+if ($opt->{help}) { print $help; exit 0 };
+die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
+require IO::Handle;
+STDOUT->autoflush(1);
+STDERR->autoflush(1);
+local $SIG{USR1} = 'IGNORE'; # to be overridden in cidx_sync
+# require lazily to speed up --help
+require PublicInbox::Admin;
+PublicInbox::Admin::do_chdir(delete $opt->{C});
+my $cfg = PublicInbox::Config->new;
+my $cidx_dir = $opt->{d};
+PublicInbox::Admin::require_or_die('Search::Xapian');
+PublicInbox::Admin::progress_prepare($opt);
+my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+%ENV = (%ENV, %$env) if $env;
+
+require PublicInbox::CodeSearchIdx; # unstable internal API
+my @git_dirs;
+if (defined(my $pl = $opt->{'project-list'})) {
+ my $pfx = shift @ARGV // die <<EOM;
+PROJECTS_DIR required for --project-list
+EOM
+ open my $fh, '<', $pl or die "open($pl): $!\n";
+ chomp(@git_dirs = <$fh>);
+ $_ = PublicInbox::Admin::resolve_git_dir("$pfx/$_") for @git_dirs;
+} else {
+ @git_dirs = map { PublicInbox::Admin::resolve_git_dir($_) } @ARGV;
+}
+if (defined $cidx_dir) { # external index
+ die "`%' is not allowed in $cidx_dir\n" if $cidx_dir =~ /\%/;
+ my $cidx = PublicInbox::CodeSearchIdx->new($cidx_dir, $opt);
+ @{$cidx->{git_dirs}} = @git_dirs; # may be empty
+ $cidx->cidx_run;
+} elsif (!@git_dirs) {
+ die $help
+} else {
+ for my $gd (@git_dirs) {
+ my $cd = "$gd/public-inbox-cindex";
+ my $cidx = PublicInbox::CodeSearchIdx->new($cd, { %$opt });
+ $cidx->{-internal} = 1;
+ @{$cidx->{git_dirs}} = ($gd);
+ $cidx->cidx_run;
+ }
+}
diff --git a/t/cindex.t b/t/cindex.t
new file mode 100644
index 00000000..c93e4e4e
--- /dev/null
+++ b/t/cindex.t
@@ -0,0 +1,98 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use PublicInbox::TestCommon;
+use Cwd qw(getcwd abs_path);
+require_mods(qw(json Search::Xapian));
+use_ok 'PublicInbox::CodeSearchIdx';
+require PublicInbox::Import;
+my ($tmp, $for_destroy) = tmpdir();
+my $pwd = getcwd();
+
+# I reworked CodeSearchIdx->shard_worker to handle empty trees
+# in the initial commit generated by cvs2svn for xapian.git
+create_coderepo 'empty-tree-root', tmpdir => "$tmp/wt0", sub {
+ xsys_e([qw(/bin/sh -c), <<'EOM']);
+git init -q &&
+tree=$(git mktree </dev/null) &&
+head=$(git symbolic-ref HEAD) &&
+cmt=$(echo 'empty root' | git commit-tree $tree) &&
+git update-ref $head $cmt &&
+echo hi >f &&
+git add f &&
+git commit -q -m hi &&
+git gc -q
+EOM
+}; # /create_coderepo
+
+ok(run_script([qw(-cindex --dangerous -q), "$tmp/wt0"]), 'cindex internal');
+ok(-e "$tmp/wt0/.git/public-inbox-cindex/cidx.lock", 'internal dir created');
+
+
+# it's possible for git to emit NUL characters in diffs
+# (see c4201214cbf10636e2c1ab9131573f735b42c8d4 in linux.git)
+my $zp = create_coderepo 'NUL in patch', sub {
+ require PublicInbox::Git;
+ my $src = PublicInbox::Git::try_cat("$pwd/COPYING");
+ xsys_e([qw(git init -q)]);
+
+ # needs to be further than FIRST_FEW_BYTES (8000) in git.git
+ $src =~ s/\b(Limitation of Liability\.)\n\n/$1\n\0\n/s or
+ xbail "BUG: no `\\n\\n' in $pwd/COPYING";
+
+ open my $fh, '>', 'f' or xbail "open: $!";
+ print $fh $src or xbail "print: $!";
+ close $fh or xbail "close: $!";
+ xsys_e([qw(/bin/sh -c), <<'EOM']);
+git add f &&
+git commit -q -m 'initial with NUL character'
+EOM
+ $src =~ s/\n\0\n/\n\n/ or xbail "BUG: no `\\n\\0\\n'";
+ open $fh, '>', 'f' or xbail "open: $!";
+ print $fh $src or xbail "print: $!";
+ close $fh or xbail "close: $!";
+ xsys_e([qw(/bin/sh -c), <<'EOM']);
+git add f &&
+git commit -q -m 'remove NUL character' &&
+git gc -q
+EOM
+}; # /create_coderepo
+
+ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp, "$tmp/wt0"]),
+ 'cindex external');
+ok(-e "$tmp/ext/cidx.lock", 'external dir created');
+ok(!-d "$zp/.git/public-inbox-cindex", 'no cindex in original coderepo');
+
+use_ok 'PublicInbox::CodeSearch';
+if ('multi-repo search') {
+ my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+ my $mset = $csrch->mset('NUL');
+ is(scalar($mset->items), 2, 'got results');
+ my $exp = [ 'initial with NUL character', 'remove NUL character' ];
+ my @have = sort(map { $_->get_document->get_data } $mset->items);
+ is_xdeeply(\@have, $exp, 'got expected subjects');
+
+ $mset = $csrch->mset('NUL', { git_dir => "$tmp/wt0/.git" });
+ is(scalar($mset->items), 0, 'no results with other GIT_DIR');
+
+ $mset = $csrch->mset('NUL', { git_dir => abs_path("$zp/.git") });
+ @have = sort(map { $_->get_document->get_data } $mset->items);
+ is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter');
+}
+
+if ('--update') {
+ my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+ my $mset = $csrch->mset('dfn:for-update');
+ is(scalar($mset->items), 0, 'no result before update');
+
+ my $e = \%PublicInbox::TestCommon::COMMIT_ENV;
+ xsys_e([qw(/bin/sh -c), <<'EOM'], $e, { -C => "$tmp/wt0" });
+>for-update && git add for-update && git commit -q -m updated
+EOM
+ ok(run_script([qw(-cindex -qu -d), "$tmp/ext"]), '-cindex -u');
+ $mset = $csrch->reopen->mset('dfn:for-update');
+ is(scalar($mset->items), 1, 'got updated result');
+}
+
+done_testing;
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 07/10] cindex: parallelize prep phases
2023-03-16 20:01 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
` (4 preceding siblings ...)
2023-03-16 20:01 ` [PATCH 06/10] codesearch: initial cut w/ -cindex tool Eric Wong
@ 2023-03-16 20:01 ` Eric Wong
2023-03-16 20:01 ` [PATCH 08/10] cindex: use read-only shards during " Eric Wong
` (2 subsequent siblings)
8 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-16 20:01 UTC (permalink / raw)
To: spew
Listing refs, fingerprinting and root scanning can all be
parallelized to reduce runtime on SMP systems.
We'll use DESTROY-based dependency management with
parallelization as in LeiMirror to handle ref listing and
fingerprinting before serializing Xapian DB access to check
against the existing fingerprint.
We'll also delay root listing until we get a fingerprint
mismatch to speed up no-op indexing.
---
lib/PublicInbox/CodeSearchIdx.pm | 197 +++++++++++++++++++++----------
1 file changed, 132 insertions(+), 65 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 218338da..a926886e 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -26,7 +26,10 @@ use PublicInbox::SHA qw(sha256_hex);
use PublicInbox::Search qw(xap_terms);
use PublicInbox::SearchIdx qw(add_val);
use PublicInbox::Config;
-use PublicInbox::Spawn qw(run_die);
+use PublicInbox::Spawn qw(spawn);
+use PublicInbox::OnDestroy;
+our $LIVE; # pid => callback
+our $LIVE_JOBS;
# stop walking history if we see >$SEEN_MAX existing commits, this assumes
# branches don't diverge by more than this number of commits...
@@ -106,26 +109,27 @@ sub progress {
$pr->($self->{git} ? ("$self->{git}->{git_dir}: ") : (), @msg, "\n");
}
-sub store_repo ($$) {
- my ($self, $repo) = @_;
+sub store_repo ($$$) {
+ my ($self, $git, $repo) = @_;
my $xdb = delete($repo->{shard})->idx_acquire;
$xdb->begin_transaction;
+ for (@{$repo->{to_delete}}) { $xdb->delete_document($_) } # XXX needed?
if (defined $repo->{id}) {
my $doc = $xdb->get_document($repo->{id}) //
- die "$self->{git}->{git_dir} doc #$repo->{id} gone";
+ die "$git->{git_dir} doc #$repo->{id} gone";
add_val($doc, PublicInbox::CodeSearch::CT, $repo->{ct});
- my %new = map { $_ => undef } @{$self->{roots}};
+ my %new = map { $_ => undef } @{$repo->{roots}};
my $old = xap_terms('G', $doc);
delete @new{keys %$old};
$doc->add_boolean_term('G'.$_) for keys %new;
- delete @$old{@{$self->{roots}}};
+ delete @$old{@{$repo->{roots}}};
$doc->remove_term('G'.$_) for keys %$old;
$doc->set_data($repo->{fp});
$xdb->replace_document($repo->{id}, $doc);
} else {
my $new = $PublicInbox::Search::X{Document}->new;
add_val($new, PublicInbox::CodeSearch::CT, $repo->{ct});
- $new->add_boolean_term("P$self->{git}->{git_dir}");
+ $new->add_boolean_term("P$git->{git_dir}");
$new->add_boolean_term('T'.'r');
$new->add_boolean_term('G'.$_) for @{$repo->{roots}};
$new->set_data($repo->{fp}); # \n delimited
@@ -201,75 +205,98 @@ sub docids_by_postlist ($$) { # consider moving to PublicInbox::Search
@ids;
}
-sub get_roots ($$) {
- my ($self, $refs) = @_;
- my @roots = $self->{git}->qx([qw(rev-list --stdin --max-parents=0)],
- undef, { 0 => $refs });
- die "git rev-list \$?=$?" if $?;
- sysseek($refs, 0, SEEK_SET) or die "seek: $!"; # for rev-list --stdin
- chomp(@roots);
- scalar(@roots) ? \@roots : undef;
+sub cidx_reap ($$) {
+ my ($self, $jobs) = @_;
+ while (keys(%$LIVE) >= $jobs) {
+ my $pid = waitpid(-1, 0) // die "waitpid(-1): $!";
+ last if $pid < 0;
+ if (my $x = delete $LIVE->{$pid}) {
+ my $cb = shift @$x;
+ $cb->(@$x) if $cb;
+ } else {
+ warn "reaped unknown PID=$pid ($?)\n";
+ }
+ }
}
# this is different from the grokmirror-compatible fingerprint since we
# only care about --heads (branches) and --tags, and not even their names
-sub cidx_fp ($) {
- my ($self) = @_;
+sub fp_start ($$$) {
+ my ($self, $git, $prep_repo) = @_;
+ return if !$LIVE; # premature exit
+ cidx_reap($self, $LIVE_JOBS);
open my $refs, '+>', undef or die "open: $!";
- run_die(['git', "--git-dir=$self->{git}->{git_dir}",
+ my $pid = spawn(['git', "--git-dir=$git->{git_dir}",
qw(show-ref --heads --tags --hash)], undef, { 1 => $refs });
+ $git->{-repo}->{refs} = $refs;
+ $LIVE->{$pid} = [ \&fp_fini, $self, $git, $prep_repo ];
+}
+
+sub fp_fini {
+ my ($self, $git, $prep_repo) = @_;
+ my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
seek($refs, 0, SEEK_SET) or die "seek: $!";
my $buf;
my $dig = PublicInbox::SHA->new(256);
while (read($refs, $buf, 65536)) { $dig->add($buf) }
- sysseek($refs, 0, SEEK_SET) or die "seek: $!"; # for rev-list --stdin
- ($dig->hexdigest, $refs);
+ $git->{-repo}->{fp} = $dig->hexdigest;
}
-# TODO: should we also index gitweb.owner and the full fingerprint for grokmirror?
-sub prep_git_dir ($) {
- my ($self) = @_;
- my $git_dir = $self->{git}->{git_dir};
- my $ct = $self->{git}->qx([qw[for-each-ref
- --sort=-committerdate --format=%(committerdate:raw) --count=1
+sub ct_start ($$$) {
+ my ($self, $git, $prep_repo) = @_;
+ return if !$LIVE; # premature exit
+ cidx_reap($self, $LIVE_JOBS);
+ my ($rd, $pid) = $git->popen([qw[for-each-ref --sort=-committerdate
+ --format=%(committerdate:raw) --count=1
refs/heads/ refs/tags/]]);
- my $repo = {};
- @$repo{qw(fp refs)} = cidx_fp($self);
- $repo->{roots} = get_roots($self, $repo->{refs});
- if (!$repo->{roots} || !defined($ct)) {
- warn "W: $git_dir has no root commits, skipping\n";
+ $LIVE->{$pid} = [ \&ct_fini, $self, $git, $rd, $prep_repo ];
+}
+
+sub ct_fini {
+ my ($self, $git, $rd, $prep_repo) = @_;
+ defined(my $ct = <$rd>) or return;
+ $ct =~ s/\s+.*\z//s; # drop TZ + LF
+ $git->{-repo}->{ct} = $ct + 0;
+}
+
+# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
+sub prep_repo ($$) {
+ my ($self, $git) = @_;
+ return if !$LIVE; # premature exit
+ my $repo = $git->{-repo} // die 'BUG: no {-repo}';
+ my $git_dir = $git->{git_dir};
+ if (!defined($repo->{ct})) {
+ warn "W: $git_dir has no commits, skipping\n";
+ delete $git->{-repo};
return;
}
- $ct =~ s/ .*\z//s; # drop TZ
- $repo->{ct} = $ct + 0;
my $n = git_dir_hash($git_dir) % $self->{nshard};
my $shard = $repo->{shard} = bless { %$self, shard => $n }, ref($self);
delete @$shard{qw(lockfh lock_path)};
local $shard->{xdb};
my $xdb = $shard->idx_acquire;
my @docids = docids_by_postlist($shard, 'P'.$git_dir);
- my $docid = shift(@docids) // return $repo;
+ my $docid = shift(@docids) // return get_roots($self, $git);
if (@docids) {
warn "BUG: $git_dir indexed multiple times, culling\n";
- $xdb->begin_transaction;
- for (@docids) { $xdb->delete_document($_) }
- $xdb->commit_transaction;
+ $repo->{to_delete} = \@docids; # XXX needed?
}
my $doc = $xdb->get_document($docid) //
die "BUG: no #$docid ($git_dir)";
my $old_fp = $doc->get_data;
if ($old_fp eq $repo->{fp}) { # no change
- progress($self, 'unchanged');
+ progress($self, "$git_dir unchanged");
+ delete $git->{-repo};
return;
}
$repo->{id} = $docid;
- $repo;
+ get_roots($self, $git);
}
-sub partition_refs ($$) {
- my ($self, $refs) = @_; # show-ref --heads --tags --hash output
- my $fh = $self->{git}->popen(qw(rev-list --stdin), undef,
- { 0 => $refs });
+sub partition_refs ($$$) {
+ my ($self, $git, $refs) = @_; # show-ref --heads --tags --hash output
+ sysseek($refs, 0, SEEK_SET) or die "seek: $!"; # for rev-list --stdin
+ my $fh = $git->popen(qw(rev-list --stdin), undef, { 0 => $refs });
close $refs or die "close: $!";
local $self->{xdb};
my $xdb = $self->{-opt}->{reindex} ? undef : $self->xdb;
@@ -292,22 +319,27 @@ sub partition_refs ($$) {
close($fh);
if (!$? || (($? & 127) == POSIX::SIGPIPE && $seen > $SEEN_MAX)) {
$self->{nchange} += $nchange;
- progress($self, "$nchange commits");
+ progress($self, "$git->{git_dir}: $nchange commits");
for my $fh (@shard_in) {
$fh->flush or die "flush: $!";
sysseek($fh, 0, SEEK_SET) or die "seek: $!";
}
return @shard_in;
}
- die "git-rev-list: \$?=$?\n";
+ die "git --git-dir=$git->{git_dir} rev-list: \$?=$?\n";
}
-sub index_git_dir ($$) {
- my ($self, $git_dir) = @_;
- local $self->{git} = PublicInbox::Git->new($git_dir); # for ->patch_id
- my $repo = prep_git_dir($self) or return;
- local $self->{current_info} = $git_dir;
- my @shard_in = partition_refs($self, delete($repo->{refs}));
+sub index_repo {
+ my ($self, $git, $roots) = @_;
+ return if !$LIVE; # premature exit
+ my $repo = delete $git->{-repo} or return;
+ seek($roots, 0, SEEK_SET) or die "seek: $!";
+ chomp(my @roots = <$roots>);
+ close($roots) or die "close: $!";
+ @roots or return warn("E: $git->{git_dir} has no root commits\n");
+ $repo->{roots} = \@roots;
+ local $self->{current_info} = $git->{git_dir};
+ my @shard_in = partition_refs($self, $git, delete($repo->{refs}));
my %pids;
my $fwd_kill = sub {
my ($sig) = @_;
@@ -323,12 +355,13 @@ sub index_git_dir ($$) {
my $pid = fork // die "fork: $!";
if ($pid == 0) { # no RNG use, here
$0 = "code index [$n]";
+ $self->{git} = $git;
$self->{shard} = $n;
$self->{current_info} = "$self->{current_info} [$n]";
delete @$self{qw(lockfh lock_path)};
my $in = $shard_in[$n];
@shard_in = ();
- $self->{roots} = delete $repo->{roots};
+ $self->{roots} = \@roots;
undef $repo;
eval { shard_worker($self, $in, $sigset) };
warn "E: $@" if $@;
@@ -339,18 +372,41 @@ sub index_git_dir ($$) {
}
PublicInbox::DS::sig_setmask($sigset);
@shard_in = ();
- my $err;
+ my ($err, @todo);
while (keys %pids) {
- my $pid = waitpid(-1, 0) or last;
- my $j = delete $pids{$pid} // "unknown PID:$pid";
- next if $? == 0;
- warn "PID:$pid $j exited with \$?=$?\n";
- $err = 1;
+ my $pid = waitpid(-1, 0) // die "waitpid: $!";
+ if (my $j = delete $pids{$pid}) {
+ next if $? == 0;
+ warn "PID:$pid $j exited with \$?=$?\n";
+ $err = 1;
+ } elsif (my $todo = delete $LIVE->{$pid}) {
+ warn "PID:$pid exited with \$?=$?\n" if $?;
+ push @todo, $todo;
+ } else {
+ warn "reaped unknown PID=$pid ($?)\n";
+ }
}
die "subprocess(es) failed\n" if $err;
- store_repo($self, $repo);
- progress($self, 'done');
+ store_repo($self, $git, $repo);
+ progress($self, "$git->{git_dir}: done");
# TODO: check fp afterwards?
+ while (my $x = shift @todo) {
+ my $cb = shift @$x;
+ $cb->(@$x) if $cb;
+ }
+}
+
+sub get_roots ($$) {
+ my ($self, $git) = @_;
+ return if !$LIVE; # premature exit
+ cidx_reap($self, $LIVE_JOBS);
+ my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
+ sysseek($refs, 0, SEEK_SET) or die "seek: $!";
+ open my $roots, '+>', undef or die "open: $!";
+ my $pid = spawn(['git', "--git-dir=$git->{git_dir}",
+ qw(rev-list --stdin --max-parents=0)],
+ undef, { 0 => $refs, 1 => $roots });
+ $LIVE->{$pid} = [ \&index_repo, $self, $git, $roots ];
}
# for PublicInbox::SearchIdx::patch_id and with_umask
@@ -389,6 +445,21 @@ W: memory usage may be high for large indexing runs
EOM
}
+sub scan_git_dirs ($) {
+ my ($self) = @_;
+ local $LIVE_JOBS = $self->{-opt}->{jobs} //
+ PublicInbox::IPC::detect_nproc() // 2;
+ local $LIVE = {};
+ for (@{$self->{git_dirs}}) {
+ my $git = PublicInbox::Git->new($_);
+ my $prep_repo = PublicInbox::OnDestroy->new($$, \&prep_repo,
+ $self, $git);
+ fp_start($self, $git, $prep_repo);
+ ct_start($self, $git, $prep_repo);
+ }
+ cidx_reap($self, 0);
+}
+
sub cidx_run {
my ($self) = @_;
cidx_init($self);
@@ -414,11 +485,7 @@ sub cidx_run {
}
local $self->{nchange} = 0;
# do_prune($self) if $self->{-opt}->{prune}; TODO
- if ($self->{-opt}->{scan} // 1) {
- for my $gd (@{$self->{git_dirs}}) {
- index_git_dir($self, $gd);
- }
- }
+ scan_git_dirs($self) if $self->{-opt}->{scan} // 1;
$self->lock_release(!!$self->{nchange});
}
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 08/10] cindex: use read-only shards during prep phases
2023-03-16 20:01 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
` (5 preceding siblings ...)
2023-03-16 20:01 ` [PATCH 07/10] cindex: parallelize prep phases Eric Wong
@ 2023-03-16 20:01 ` Eric Wong
2023-03-16 20:01 ` [PATCH 09/10] searchidxshard: improve comment wording Eric Wong
2023-03-16 20:01 ` [PATCH 10/10] cindex: use DS and workqueues for parallelism Eric Wong
8 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-16 20:01 UTC (permalink / raw)
To: spew
No need to open shards for read/write access when read-only
will do. Since we also control how a document gets sharded,
we'll also access the shard directly instead of letting Xapian
do the mappings.
--reindex didn't work properly before this change since it was
over-indexing. It is now broken in the opposite way in that it
doesn't do reindexing at all. --reindex will be implemented
properly in the future.
---
lib/PublicInbox/CodeSearchIdx.pm | 26 ++++++++++++++------------
1 file changed, 14 insertions(+), 12 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index a926886e..02c9ed84 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -30,6 +30,7 @@ use PublicInbox::Spawn qw(spawn);
use PublicInbox::OnDestroy;
our $LIVE; # pid => callback
our $LIVE_JOBS;
+our @XDB_SHARDS_FLAT;
# stop walking history if we see >$SEEN_MAX existing commits, this assumes
# branches don't diverge by more than this number of commits...
@@ -273,9 +274,9 @@ sub prep_repo ($$) {
my $n = git_dir_hash($git_dir) % $self->{nshard};
my $shard = $repo->{shard} = bless { %$self, shard => $n }, ref($self);
delete @$shard{qw(lockfh lock_path)};
- local $shard->{xdb};
- my $xdb = $shard->idx_acquire;
- my @docids = docids_by_postlist($shard, 'P'.$git_dir);
+ my $xdb = $XDB_SHARDS_FLAT[$n] // die "BUG: shard[$n] undef";
+ $xdb->reopen;
+ my @docids = docids_by_postlist({ xdb => $xdb }, 'P'.$git_dir);
my $docid = shift(@docids) // return get_roots($self, $git);
if (@docids) {
warn "BUG: $git_dir indexed multiple times, culling\n";
@@ -298,19 +299,19 @@ sub partition_refs ($$$) {
sysseek($refs, 0, SEEK_SET) or die "seek: $!"; # for rev-list --stdin
my $fh = $git->popen(qw(rev-list --stdin), undef, { 0 => $refs });
close $refs or die "close: $!";
- local $self->{xdb};
- my $xdb = $self->{-opt}->{reindex} ? undef : $self->xdb;
- my ($seen, $nchange, $nshard) = (0, 0, $self->{nshard});
- my @shard_in;
- for (0..($nshard - 1)) {
- open $shard_in[$_], '+>', undef or die "open: $!";
- }
+ my ($seen, $nchange) = (0, 0);
+ my @shard_in = map {
+ $_->reopen;
+ open my $fh, '+>', undef or die "open: $!";
+ $fh;
+ } @XDB_SHARDS_FLAT;
+
while (defined(my $cmt = <$fh>)) {
chomp $cmt;
- if ($xdb && seen($xdb, 'Q'.$cmt)) {
+ my $n = hex(substr($cmt, 0, 8)) % scalar(@XDB_SHARDS_FLAT);
+ if (seen($XDB_SHARDS_FLAT[$n], 'Q'.$cmt)) {
last if ++$seen > $SEEN_MAX;
} else {
- my $n = hex(substr($cmt, 0, 8)) % $nshard;
say { $shard_in[$n] } $cmt or die "say: $!";
++$nchange;
$seen = 0;
@@ -450,6 +451,7 @@ sub scan_git_dirs ($) {
local $LIVE_JOBS = $self->{-opt}->{jobs} //
PublicInbox::IPC::detect_nproc() // 2;
local $LIVE = {};
+ local @XDB_SHARDS_FLAT = $self->xdb_shards_flat;
for (@{$self->{git_dirs}}) {
my $git = PublicInbox::Git->new($_);
my $prep_repo = PublicInbox::OnDestroy->new($$, \&prep_repo,
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 09/10] searchidxshard: improve comment wording
2023-03-16 20:01 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
` (6 preceding siblings ...)
2023-03-16 20:01 ` [PATCH 08/10] cindex: use read-only shards during " Eric Wong
@ 2023-03-16 20:01 ` Eric Wong
2023-03-16 20:01 ` [PATCH 10/10] cindex: use DS and workqueues for parallelism Eric Wong
8 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-16 20:01 UTC (permalink / raw)
To: spew
Just something I noticed while considering using this package
for CodeSearchIdx.
---
lib/PublicInbox/SearchIdxShard.pm | 7 +++----
1 file changed, 3 insertions(+), 4 deletions(-)
diff --git a/lib/PublicInbox/SearchIdxShard.pm b/lib/PublicInbox/SearchIdxShard.pm
index 000abd94..831be51b 100644
--- a/lib/PublicInbox/SearchIdxShard.pm
+++ b/lib/PublicInbox/SearchIdxShard.pm
@@ -1,11 +1,10 @@
-# Copyright (C) 2018-2021 all contributors <meta@public-inbox.org>
+# Copyright (C) all contributors <meta@public-inbox.org>
# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
# Internal interface for a single Xapian shard in V2 inboxes.
# See L<public-inbox-v2-format(5)> for more info on how we shard Xapian
package PublicInbox::SearchIdxShard;
-use strict;
-use v5.10.1;
+use v5.12;
use parent qw(PublicInbox::SearchIdx PublicInbox::IPC);
use PublicInbox::OnDestroy;
@@ -47,7 +46,7 @@ sub ipc_atfork_child { # called automatically before ipc_worker_loop
$v2w->atfork_child; # calls ipc_sibling_atfork_child on our siblings
$v2w->{current_info} = "[$self->{shard}]"; # for $SIG{__WARN__}
$self->begin_txn_lazy;
- # caller must capture this:
+ # caller (ipc_worker_spawn) must capture this:
PublicInbox::OnDestroy->new($$, \&_worker_done, $self);
}
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 10/10] cindex: use DS and workqueues for parallelism
2023-03-16 20:01 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
` (7 preceding siblings ...)
2023-03-16 20:01 ` [PATCH 09/10] searchidxshard: improve comment wording Eric Wong
@ 2023-03-16 20:01 ` Eric Wong
8 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-16 20:01 UTC (permalink / raw)
To: spew
This avoids forking new shard processes for each repo we scan,
but we can't avoid many excessive commits since we need to
ensure the `seen()' sub can avoid excessive work.
---
lib/PublicInbox/CodeSearchIdx.pm | 374 ++++++++++++++++++++-----------
1 file changed, 240 insertions(+), 134 deletions(-)
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
index 02c9ed84..13fe1c28 100644
--- a/lib/PublicInbox/CodeSearchIdx.pm
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -14,9 +14,11 @@
# See PublicInbox::CodeSearch (read-only API) for more
package PublicInbox::CodeSearchIdx;
use v5.12;
-use parent qw(PublicInbox::Lock PublicInbox::CodeSearch PublicInbox::SearchIdx);
+# parent order matters, we want ->DESTROY from IPC, not SearchIdx
+use parent qw(PublicInbox::CodeSearch PublicInbox::IPC PublicInbox::SearchIdx);
use PublicInbox::Eml;
-use PublicInbox::DS ();
+use PublicInbox::DS qw(awaitpid);
+use PublicInbox::PktOp;
use PublicInbox::IPC qw(nproc_shards);
use PublicInbox::Admin;
use POSIX qw(WNOHANG SEEK_SET);
@@ -26,11 +28,19 @@ use PublicInbox::SHA qw(sha256_hex);
use PublicInbox::Search qw(xap_terms);
use PublicInbox::SearchIdx qw(add_val);
use PublicInbox::Config;
-use PublicInbox::Spawn qw(spawn);
+use PublicInbox::Spawn qw(spawn popen_rd);
use PublicInbox::OnDestroy;
-our $LIVE; # pid => callback
-our $LIVE_JOBS;
-our @XDB_SHARDS_FLAT;
+use Socket qw(MSG_EOR);
+use Carp ();
+our (
+ $LIVE, # pid => cmd
+ $DEFER, # [ [ cb, @args ], ... ]
+ $LIVE_JOBS, # integer
+ $MY_SIG, # like %SIG
+ $SIGSET,
+ @RDONLY_SHARDS, # Xapian::Database
+ @IDX_SHARDS # clones of self
+);
# stop walking history if we see >$SEEN_MAX existing commits, this assumes
# branches don't diverge by more than this number of commits...
@@ -110,14 +120,14 @@ sub progress {
$pr->($self->{git} ? ("$self->{git}->{git_dir}: ") : (), @msg, "\n");
}
-sub store_repo ($$$) {
- my ($self, $git, $repo) = @_;
- my $xdb = delete($repo->{shard})->idx_acquire;
- $xdb->begin_transaction;
+sub store_repo { # wq_do - returns docid
+ my ($self, $repo) = @_;
+ $self->begin_txn_lazy;
+ my $xdb = $self->{xdb};
for (@{$repo->{to_delete}}) { $xdb->delete_document($_) } # XXX needed?
- if (defined $repo->{id}) {
- my $doc = $xdb->get_document($repo->{id}) //
- die "$git->{git_dir} doc #$repo->{id} gone";
+ if (defined $repo->{docid}) {
+ my $doc = $xdb->get_document($repo->{docid}) //
+ die "$repo->{git_dir} doc #$repo->{docid} gone";
add_val($doc, PublicInbox::CodeSearch::CT, $repo->{ct});
my %new = map { $_ => undef } @{$repo->{roots}};
my $old = xap_terms('G', $doc);
@@ -126,34 +136,38 @@ sub store_repo ($$$) {
delete @$old{@{$repo->{roots}}};
$doc->remove_term('G'.$_) for keys %$old;
$doc->set_data($repo->{fp});
- $xdb->replace_document($repo->{id}, $doc);
+ $xdb->replace_document($repo->{docid}, $doc);
+ $repo->{docid}
} else {
my $new = $PublicInbox::Search::X{Document}->new;
add_val($new, PublicInbox::CodeSearch::CT, $repo->{ct});
- $new->add_boolean_term("P$git->{git_dir}");
+ $new->add_boolean_term("P$repo->{git_dir}");
$new->add_boolean_term('T'.'r');
$new->add_boolean_term('G'.$_) for @{$repo->{roots}};
$new->set_data($repo->{fp}); # \n delimited
$xdb->add_document($new);
}
- $xdb->commit_transaction;
}
# sharded reader for `git log --pretty=format: --stdin'
-sub shard_worker ($$$) {
- my ($self, $r, $sigset) = @_;
+sub shard_index { # via wq_io_do
+ my ($self, $git, $n, $roots) = @_;
+ local $self->{current_info} = "$git->{git_dir} [$n]";
my ($quit, $cmt);
+ local $self->{roots} = $roots;
+ my $in = delete($self->{0}) // die 'BUG: no {0} input';
+ my $op_p = delete($self->{1}) // die 'BUG: no {1} op_p';
my $batch_bytes = $self->{-opt}->{batch_size} //
$PublicInbox::SearchIdx::BATCH_BYTES;
my $max = $batch_bytes;
- $SIG{USR1} = sub { $max = -1 }; # similar to `git fast-import'
- $SIG{QUIT} = $SIG{TERM} = $SIG{INT} = sub { $quit = shift };
- PublicInbox::DS::sig_setmask($sigset);
-
- # the parent process of this shard process writes directly to
- # the stdin of `git log', we consume git log's stdout:
- my $rd = $self->{git}->popen(@LOG_STDIN, undef, { 0 => $r });
- close $r or die "close: $!";
+ my $set_quit = sub { $quit = shift };
+ local $SIG{USR1} = sub { $max = -1 }; # similar to `git fast-import'
+ local $SIG{QUIT} = $set_quit;
+ local $SIG{TERM} = $set_quit;
+ local $SIG{INT} = $set_quit;
+ local $self->{git} = $git; # for patchid
+ my $rd = $git->popen(@LOG_STDIN, undef, { 0 => $in });
+ close $in or die "close: $!";
my $nr = 0;
# a patch may have \0, see c4201214cbf10636e2c1ab9131573f735b42c8d4
@@ -162,8 +176,7 @@ sub shard_worker ($$$) {
local $/ = $FS;
my $buf = <$rd> // return; # leading $FS
$buf eq $FS or die "BUG: not LF-NUL: $buf\n";
- my $xdb = $self->idx_acquire;
- $xdb->begin_transaction;
+ $self->begin_txn_lazy;
while (defined($buf = <$rd>)) {
chomp($buf);
$max -= length($buf);
@@ -174,24 +187,40 @@ sub shard_worker ($$$) {
++$nr;
if ($max <= 0 && !$PublicInbox::Search::X{CLOEXEC_UNSET}) {
progress($self, $nr);
- $xdb->commit_transaction;
+ $self->{xdb}->commit_transaction;
$max = $batch_bytes;
- $xdb->begin_transaction;
+ $self->{xdb}->begin_transaction;
}
$/ = $FS;
}
close($rd);
if (!$? || ($quit && ($? & 127) == POSIX::SIGPIPE)) {
- $xdb->commit_transaction;
+ send($op_p, "shard_done $n", MSG_EOR);
} else {
warn "E: git @LOG_STDIN: \$?=$?\n";
- $xdb->cancel_transaction;
+ $self->{xdb}->cancel_transaction;
}
}
+sub shard_done { # called via PktOp on shard_index completion
+ my ($self, $n) = @_;
+ $self->{-shard_ok}->{$n} = 1 if defined($self->{-shard_ok});
+}
+
sub seen ($$) {
my ($xdb, $q) = @_; # $q = "Q$COMMIT_HASH"
- $xdb->postlist_begin($q) != $xdb->postlist_end($q)
+ for (1..100) {
+ my $ret = eval {
+ $xdb->postlist_begin($q) != $xdb->postlist_end($q);
+ };
+ return $ret unless $@;
+ if (ref($@) =~ /\bDatabaseModifiedError\b/) {
+ $xdb->reopen;
+ } else {
+ Carp::croak($@);
+ }
+ }
+ Carp::croak('too many Xapian DB modifications in progress');
}
# used to select the shard for a GIT_DIR
@@ -206,18 +235,42 @@ sub docids_by_postlist ($$) { # consider moving to PublicInbox::Search
@ids;
}
+sub run_todo ($) {
+ my ($self) = @_;
+ my $n;
+ while (defined(my $x = shift(@{$self->{todo} // []}))) {
+ my $cb = shift @$x;
+ $cb->(@$x);
+ ++$n;
+ }
+ $n;
+}
+
sub cidx_reap ($$) {
my ($self, $jobs) = @_;
- while (keys(%$LIVE) >= $jobs) {
- my $pid = waitpid(-1, 0) // die "waitpid(-1): $!";
- last if $pid < 0;
- if (my $x = delete $LIVE->{$pid}) {
- my $cb = shift @$x;
- $cb->(@$x) if $cb;
- } else {
- warn "reaped unknown PID=$pid ($?)\n";
- }
+ while (run_todo($self)) {}
+ my $cb = sub { keys(%$LIVE) > $jobs };
+ PublicInbox::DS->SetPostLoopCallback($cb);
+ PublicInbox::DS::event_loop($MY_SIG, $SIGSET) while $cb->();
+ while (!$jobs && run_todo($self)) {}
+}
+
+sub cidx_await_cb { # awaitpid cb
+ my ($pid, $cb, $self, $git, @args) = @_;
+ return if !$LIVE; # premature shutdown
+ my $cmd = delete $LIVE->{$pid} // die 'BUG: no $cmd';
+ PublicInbox::DS::enqueue_reap() if !keys(%$LIVE); # once more for PLC
+ if ($?) {
+ $git->{-cidx_err} = 1;
+ return warn("@$cmd error: \$?=$?\n");
}
+ push(@$DEFER, [ $cb, $self, $git, @args ]) if $DEFER;
+}
+
+sub cidx_await ($$$$$@) {
+ my ($pid, $cmd, $cb, $self, $git, @args) = @_;
+ $LIVE->{$pid} = $cmd;
+ awaitpid($pid, \&cidx_await_cb, $cb, $self, $git, @args);
}
# this is different from the grokmirror-compatible fingerprint since we
@@ -227,13 +280,14 @@ sub fp_start ($$$) {
return if !$LIVE; # premature exit
cidx_reap($self, $LIVE_JOBS);
open my $refs, '+>', undef or die "open: $!";
- my $pid = spawn(['git', "--git-dir=$git->{git_dir}",
- qw(show-ref --heads --tags --hash)], undef, { 1 => $refs });
+ my $cmd = ['git', "--git-dir=$git->{git_dir}",
+ qw(show-ref --heads --tags --hash)];
+ my $pid = spawn($cmd, undef, { 1 => $refs });
$git->{-repo}->{refs} = $refs;
- $LIVE->{$pid} = [ \&fp_fini, $self, $git, $prep_repo ];
+ cidx_await($pid, $cmd, \&fp_fini, $self, $git, $prep_repo);
}
-sub fp_fini {
+sub fp_fini { # cidx_await cb
my ($self, $git, $prep_repo) = @_;
my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
seek($refs, 0, SEEK_SET) or die "seek: $!";
@@ -247,13 +301,15 @@ sub ct_start ($$$) {
my ($self, $git, $prep_repo) = @_;
return if !$LIVE; # premature exit
cidx_reap($self, $LIVE_JOBS);
- my ($rd, $pid) = $git->popen([qw[for-each-ref --sort=-committerdate
+ my $cmd = [ 'git', "--git-dir=$git->{git_dir}",
+ qw[for-each-ref --sort=-committerdate
--format=%(committerdate:raw) --count=1
- refs/heads/ refs/tags/]]);
- $LIVE->{$pid} = [ \&ct_fini, $self, $git, $rd, $prep_repo ];
+ refs/heads/ refs/tags/] ];
+ my ($rd, $pid) = popen_rd($cmd);
+ cidx_await($pid, $cmd, \&ct_fini, $self, $git, $rd, $prep_repo);
}
-sub ct_fini {
+sub ct_fini { # cidx_await cb
my ($self, $git, $rd, $prep_repo) = @_;
defined(my $ct = <$rd>) or return;
$ct =~ s/\s+.*\z//s; # drop TZ + LF
@@ -263,34 +319,38 @@ sub ct_fini {
# TODO: also index gitweb.owner and the full fingerprint for grokmirror?
sub prep_repo ($$) {
my ($self, $git) = @_;
- return if !$LIVE; # premature exit
+ return if !$LIVE || $git->{-cidx_err}; # premature exit
my $repo = $git->{-repo} // die 'BUG: no {-repo}';
- my $git_dir = $git->{git_dir};
if (!defined($repo->{ct})) {
- warn "W: $git_dir has no commits, skipping\n";
+ warn "W: $git->{git_dir} has no commits, skipping\n";
delete $git->{-repo};
return;
}
- my $n = git_dir_hash($git_dir) % $self->{nshard};
- my $shard = $repo->{shard} = bless { %$self, shard => $n }, ref($self);
+ my $n = git_dir_hash($git->{git_dir}) % $self->{nshard};
+ my $shard = bless { %$self, shard => $n }, ref($self);
+ $repo->{shard_n} = $n;
delete @$shard{qw(lockfh lock_path)};
- my $xdb = $XDB_SHARDS_FLAT[$n] // die "BUG: shard[$n] undef";
- $xdb->reopen;
- my @docids = docids_by_postlist({ xdb => $xdb }, 'P'.$git_dir);
+ local $shard->{xdb} = $RDONLY_SHARDS[$n] // die "BUG: shard[$n] undef";
+ $shard->retry_reopen(\&check_existing, $self, $git);
+}
+
+sub check_existing { # retry_reopen callback
+ my ($shard, $self, $git) = @_;
+ my @docids = docids_by_postlist($shard, 'P'.$git->{git_dir});
my $docid = shift(@docids) // return get_roots($self, $git);
- if (@docids) {
- warn "BUG: $git_dir indexed multiple times, culling\n";
- $repo->{to_delete} = \@docids; # XXX needed?
- }
- my $doc = $xdb->get_document($docid) //
- die "BUG: no #$docid ($git_dir)";
+ my $doc = $shard->{xdb}->get_document($docid) //
+ die "BUG: no #$docid ($git->{git_dir})";
my $old_fp = $doc->get_data;
- if ($old_fp eq $repo->{fp}) { # no change
- progress($self, "$git_dir unchanged");
+ if ($old_fp eq $git->{-repo}->{fp}) { # no change
+ progress($self, "$git->{git_dir} unchanged");
delete $git->{-repo};
return;
}
- $repo->{id} = $docid;
+ $git->{-repo}->{docid} = $docid;
+ if (@docids) {
+ warn "BUG: $git->{git_dir} indexed multiple times, culling\n";
+ $git->{-repo}->{to_delete} = \@docids; # XXX needed?
+ }
get_roots($self, $git);
}
@@ -304,12 +364,12 @@ sub partition_refs ($$$) {
$_->reopen;
open my $fh, '+>', undef or die "open: $!";
$fh;
- } @XDB_SHARDS_FLAT;
+ } @RDONLY_SHARDS;
while (defined(my $cmt = <$fh>)) {
chomp $cmt;
- my $n = hex(substr($cmt, 0, 8)) % scalar(@XDB_SHARDS_FLAT);
- if (seen($XDB_SHARDS_FLAT[$n], 'Q'.$cmt)) {
+ my $n = hex(substr($cmt, 0, 8)) % scalar(@RDONLY_SHARDS);
+ if (seen($RDONLY_SHARDS[$n], 'Q'.$cmt)) {
last if ++$seen > $SEEN_MAX;
} else {
say { $shard_in[$n] } $cmt or die "say: $!";
@@ -330,9 +390,33 @@ sub partition_refs ($$$) {
die "git --git-dir=$git->{git_dir} rev-list: \$?=$?\n";
}
-sub index_repo {
+sub shard_commit { # via wq_io_do
+ my ($self, $n) = @_;
+ my $op_p = delete($self->{0}) // die 'BUG: no {0} op_p';
+ $self->commit_txn_lazy;
+ send($op_p, "shard_done $n", MSG_EOR);
+}
+
+sub commit_used_shards ($$$) {
+ my ($self, $git, $consumers) = @_;
+ local $self->{-shard_ok} = {};
+ for my $n (keys %$consumers) {
+ my ($c, $p) = PublicInbox::PktOp->pair;
+ $c->{ops}->{shard_done} = [ $self ];
+ $IDX_SHARDS[$n]->wq_io_do('shard_commit', [ $p->{op_p} ], $n);
+ $consumers->{$n} = $c;
+ }
+ PublicInbox::DS->SetPostLoopCallback(sub {
+ scalar(grep { $_->{sock} } values %$consumers);
+ });
+ PublicInbox::DS::event_loop($MY_SIG, $SIGSET);
+ my $n = grep { ! $self->{-shard_ok}->{$_} } keys %$consumers;
+ die "E: $git->{git_dir} $n shards failed" if $n;
+}
+
+sub index_repo { # cidx_await cb
my ($self, $git, $roots) = @_;
- return if !$LIVE; # premature exit
+ return if $git->{-cidx_err};
my $repo = delete $git->{-repo} or return;
seek($roots, 0, SEEK_SET) or die "seek: $!";
chomp(my @roots = <$roots>);
@@ -341,73 +425,45 @@ sub index_repo {
$repo->{roots} = \@roots;
local $self->{current_info} = $git->{git_dir};
my @shard_in = partition_refs($self, $git, delete($repo->{refs}));
- my %pids;
- my $fwd_kill = sub {
- my ($sig) = @_;
- kill($sig, $_) for keys %pids;
- };
- local $SIG{USR1} = $fwd_kill;
- local $SIG{QUIT} = $fwd_kill;
- local $SIG{INT} = $fwd_kill;
- local $SIG{TERM} = $fwd_kill;
- my $sigset = PublicInbox::DS::block_signals();
- for (my $n = 0; $n <= $#shard_in; $n++) {
+ local $self->{-shard_ok} = {}; # [0..$#shard_in] => 1
+ my %CONSUMERS;
+ for my $n (0..$#shard_in) {
-s $shard_in[$n] or next;
- my $pid = fork // die "fork: $!";
- if ($pid == 0) { # no RNG use, here
- $0 = "code index [$n]";
- $self->{git} = $git;
- $self->{shard} = $n;
- $self->{current_info} = "$self->{current_info} [$n]";
- delete @$self{qw(lockfh lock_path)};
- my $in = $shard_in[$n];
- @shard_in = ();
- $self->{roots} = \@roots;
- undef $repo;
- eval { shard_worker($self, $in, $sigset) };
- warn "E: $@" if $@;
- POSIX::_exit($@ ? 1 : 0);
- } else {
- $pids{$pid} = "code index [$n]";
- }
+ my ($c, $p) = PublicInbox::PktOp->pair;
+ $c->{ops}->{shard_done} = [ $self ];
+ $IDX_SHARDS[$n]->wq_io_do('shard_index',
+ [ $shard_in[$n], $p->{op_p} ],
+ $git, $n, \@roots);
+ $CONSUMERS{$n} = $c;
}
- PublicInbox::DS::sig_setmask($sigset);
@shard_in = ();
- my ($err, @todo);
- while (keys %pids) {
- my $pid = waitpid(-1, 0) // die "waitpid: $!";
- if (my $j = delete $pids{$pid}) {
- next if $? == 0;
- warn "PID:$pid $j exited with \$?=$?\n";
- $err = 1;
- } elsif (my $todo = delete $LIVE->{$pid}) {
- warn "PID:$pid exited with \$?=$?\n" if $?;
- push @todo, $todo;
- } else {
- warn "reaped unknown PID=$pid ($?)\n";
- }
- }
- die "subprocess(es) failed\n" if $err;
- store_repo($self, $git, $repo);
- progress($self, "$git->{git_dir}: done");
- # TODO: check fp afterwards?
- while (my $x = shift @todo) {
- my $cb = shift @$x;
- $cb->(@$x) if $cb;
+ PublicInbox::DS->SetPostLoopCallback(sub {
+ scalar(grep { $_->{sock} } values %CONSUMERS);
+ });
+ PublicInbox::DS::event_loop($MY_SIG, $SIGSET);
+ my $n = grep { ! $self->{-shard_ok}->{$_} } keys %CONSUMERS;
+ die "E: $git->{git_dir} $n shards failed" if $n;
+ $repo->{git_dir} = $git->{git_dir};
+ my $id = $IDX_SHARDS[$repo->{shard_n}]->wq_do('store_repo', $repo);
+ if ($id > 0) {
+ $CONSUMERS{$repo->{shard_n}} = undef;
+ commit_used_shards($self, $git, \%CONSUMERS);
+ progress($self, "$git->{git_dir}: done");
+ return run_todo($self);
}
+ die "E: store_repo $git->{git_dir}: id=$id";
}
sub get_roots ($$) {
my ($self, $git) = @_;
return if !$LIVE; # premature exit
- cidx_reap($self, $LIVE_JOBS);
my $refs = $git->{-repo}->{refs} // die 'BUG: no {-repo}->{refs}';
sysseek($refs, 0, SEEK_SET) or die "seek: $!";
open my $roots, '+>', undef or die "open: $!";
- my $pid = spawn(['git', "--git-dir=$git->{git_dir}",
- qw(rev-list --stdin --max-parents=0)],
- undef, { 0 => $refs, 1 => $roots });
- $LIVE->{$pid} = [ \&index_repo, $self, $git, $roots ];
+ my $cmd = [ 'git', "--git-dir=$git->{git_dir}",
+ qw(rev-list --stdin --max-parents=0) ];
+ my $pid = spawn($cmd, undef, { 0 => $refs, 1 => $roots });
+ cidx_await($pid, $cmd, \&index_repo, $self, $git, $roots);
}
# for PublicInbox::SearchIdx::patch_id and with_umask
@@ -434,9 +490,17 @@ sub cidx_init ($) {
warn "# creating $dir\n" if !$self->{-opt}->{quiet};
File::Path::mkpath($dir);
}
+ $self->lock_acquire;
+ my @shards;
for my $n (0..($self->{nshard} - 1)) {
my $shard = bless { %$self, shard => $n }, ref($self);
+ delete @$shard{qw(lockfh lock_path)};
$shard->idx_acquire;
+ $shard->idx_release;
+ $shard->wq_workers_start("shard[$n]", 1, undef, {
+ siblings => \@shards, # for ipc_atfork_child
+ }, \&shard_done_wait, $self);
+ push @shards, $shard;
}
# this warning needs to happen after idx_acquire
state $once;
@@ -444,14 +508,11 @@ sub cidx_init ($) {
W: Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks,
W: memory usage may be high for large indexing runs
EOM
+ @shards;
}
sub scan_git_dirs ($) {
my ($self) = @_;
- local $LIVE_JOBS = $self->{-opt}->{jobs} //
- PublicInbox::IPC::detect_nproc() // 2;
- local $LIVE = {};
- local @XDB_SHARDS_FLAT = $self->xdb_shards_flat;
for (@{$self->{git_dirs}}) {
my $git = PublicInbox::Git->new($_);
my $prep_repo = PublicInbox::OnDestroy->new($$, \&prep_repo,
@@ -462,18 +523,31 @@ sub scan_git_dirs ($) {
cidx_reap($self, 0);
}
-sub cidx_run {
+sub shards_active { # PostLoopCallback
+ scalar(grep { $_->{-cidx_quit} } @IDX_SHARDS);
+}
+
+sub cidx_run { # main entry point
my ($self) = @_;
- cidx_init($self);
+ local $self->{todo} = [];
+ local $DEFER = $self->{todo};
+ local $SIGSET = PublicInbox::DS::block_signals();
+ my $restore = PublicInbox::OnDestroy->new($$,
+ \&PublicInbox::DS::sig_setmask, $SIGSET);
+ local $LIVE = {};
+ local @IDX_SHARDS = cidx_init($self);
local $self->{current_info} = '';
my $cb = $SIG{__WARN__} || \&CORE::warn;
+ local $MY_SIG = {
+ CHLD => \&PublicInbox::DS::enqueue_reap,
+ INT => sub { exit },
+ };
local $SIG{__WARN__} = sub {
my $m = shift @_;
$self->{current_info} eq '' or
$m =~ s/\A(#?\s*)/$1$self->{current_info}: /;
$cb->($m, @_);
};
- $self->lock_acquire;
load_existing($self);
my @nc = grep { File::Spec->canonpath($_) ne $_ } @{$self->{git_dirs}};
if (@nc) {
@@ -486,9 +560,41 @@ sub cidx_run {
warn "E: canonicalized and attempting to continue\n";
}
local $self->{nchange} = 0;
+ local $LIVE_JOBS = $self->{-opt}->{jobs} ||
+ PublicInbox::IPC::detect_nproc() || 2;
+ local @RDONLY_SHARDS = $self->xdb_shards_flat;
+
# do_prune($self) if $self->{-opt}->{prune}; TODO
scan_git_dirs($self) if $self->{-opt}->{scan} // 1;
+
+ for my $s (@IDX_SHARDS) {
+ $s->{-cidx_quit} = 1;
+ $s->wq_close;
+ }
+
+ PublicInbox::DS->SetPostLoopCallback(\&shards_active);
+ PublicInbox::DS::event_loop($MY_SIG, $SIGSET) if shards_active();
$self->lock_release(!!$self->{nchange});
}
+sub ipc_atfork_child {
+ my ($self) = @_;
+ $self->SUPER::ipc_atfork_child;
+ my $x = delete $self->{siblings} // die 'BUG: no {siblings}';
+ $_->wq_close for @$x;
+}
+
+sub shard_done_wait { # awaitpid cb via ipc_worker_reap
+ my ($pid, $shard, $self) = @_;
+ delete($shard->{-cidx_quit}) // warn 'BUG: {-cidx_quit} unset';
+ return unless $?;
+ warn "PID:$pid $shard->{shard} exited with \$?=$?\n";
+ ++$self->{shard_err} if defined($self->{shard_err});
+}
+
+sub with_umask { # TODO
+ my ($self, $cb, @arg) = @_;
+ $cb->(@arg);
+}
+
1;
^ permalink raw reply related [flat|nested] 11+ messages in thread
* [PATCH 06/10] codesearch: initial cut w/ -cindex tool
2023-03-15 10:10 [PATCH 01/10] ipc: move nproc_shards from v2writable Eric Wong
@ 2023-03-15 10:11 ` Eric Wong
0 siblings, 0 replies; 11+ messages in thread
From: Eric Wong @ 2023-03-15 10:11 UTC (permalink / raw)
To: spew
It seems relying on root commits is a reasonable way to
deduplicate and handle repositories with common history.
I initially wanted to shoehorn this into extindex, but decided a
separate Xapian index layout capable of being both external to
handle many forks and internal (in $GIT_DIR/public-inbox-cindex)
for small projects is the right way to go.
Unlike most existing parts of public-inbox, this relies on
absolute paths of $GIT_DIR stored in the Xapian DB and does not
rely on the config file. We'll be relying on the config file to
map absolute paths to public URL paths for WWW.
---
MANIFEST | 4 +
lib/PublicInbox/CodeSearch.pm | 121 +++++++++
lib/PublicInbox/CodeSearchIdx.pm | 425 +++++++++++++++++++++++++++++++
lib/PublicInbox/MiscIdx.pm | 2 +-
lib/PublicInbox/Search.pm | 63 +++--
lib/PublicInbox/SearchIdx.pm | 88 ++++---
lib/PublicInbox/TestCommon.pm | 41 ++-
lib/PublicInbox/ViewVCS.pm | 2 +-
script/public-inbox-cindex | 75 ++++++
t/cindex.t | 98 +++++++
10 files changed, 850 insertions(+), 69 deletions(-)
create mode 100644 lib/PublicInbox/CodeSearch.pm
create mode 100644 lib/PublicInbox/CodeSearchIdx.pm
create mode 100755 script/public-inbox-cindex
create mode 100644 t/cindex.t
diff --git a/MANIFEST b/MANIFEST
index 7437bb54..1499e096 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -160,6 +160,8 @@ lib/PublicInbox/AltId.pm
lib/PublicInbox/AutoReap.pm
lib/PublicInbox/Cgit.pm
lib/PublicInbox/CmdIPC4.pm
+lib/PublicInbox/CodeSearch.pm
+lib/PublicInbox/CodeSearchIdx.pm
lib/PublicInbox/CompressNoop.pm
lib/PublicInbox/Config.pm
lib/PublicInbox/ConfigIter.pm
@@ -362,6 +364,7 @@ sa_config/README
sa_config/root/etc/spamassassin/public-inbox.pre
sa_config/user/.spamassassin/user_prefs
script/lei
+script/public-inbox-cindex
script/public-inbox-clone
script/public-inbox-compact
script/public-inbox-convert
@@ -401,6 +404,7 @@ t/altid.t
t/altid_v2.t
t/cgi.t
t/check-www-inbox.perl
+t/cindex.t
t/clone-coderepo-puh1.sh
t/clone-coderepo-puh2.sh
t/clone-coderepo.psgi
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
new file mode 100644
index 00000000..1dfc124f
--- /dev/null
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -0,0 +1,121 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# read-only external index for coderepos
+# currently, it only indexes commits and repository metadata
+# (pathname, root commits); not blob contents
+package PublicInbox::CodeSearch;
+use v5.12;
+use parent qw(PublicInbox::Search);
+use PublicInbox::Search qw(retry_reopen int_val xap_terms);
+use constant {
+ AT => 0, # author time (YYYYMMDDHHMMSS, like dt: for mail)
+ CT => 1, # commit time (Unix time stamp, like TS/rt: in mail)
+ CIDX_SCHEMA_VER => 1, # brand new schema for code search
+ # for repos (`Tr'), CT(col=1) is used for the latest tip commit time
+ # in refs/{heads,tags}. AT(col=0) may be used to store disk usage
+ # in the future, but disk usage calculation is expensive w/ alternates
+};
+
+# note: the non-X term prefix allocations are shared with Xapian omega,
+# see xapian-applications/omega/docs/termprefixes.rst
+# bool_pfx_internal:
+# type => 'T', # 'c' - commit, 'r' - repo GIT_DIR
+# tags are not indexed, only normal branches (refs/heads/*), not hidden
+# 'P' # (pathname) GIT_DIR # uniq
+# 'G' # (group) root commit (may have multiple roots)
+my %bool_pfx_external = (
+ oid => 'Q', # type:commit - git OID hex (40|64)-byte SHA-(1|256)
+ # type:repo - rel2abs_collapsed(GIT_DIR)
+ parent => 'XP',
+ %PublicInbox::Search::PATCH_BOOL_COMMON,
+);
+
+my %prob_prefix = ( # copied from PublicInbox::Search
+ # do we care about committer? or partial commit OID via Xapian?
+ # o => 'XQ', # 'oid:' (bool) is exact, 'o:' (prob) can do partial
+ %PublicInbox::Search::PATCH_PROB_COMMON,
+
+ # default:
+ '' => 'S A XQUOT XFN ' . $PublicInbox::Search::NON_QUOTED_BODY
+);
+
+sub new {
+ my ($cls, $dir) = @_;
+ bless { xpfx => "$dir/cidx".CIDX_SCHEMA_VER }, $cls;
+}
+
+sub cqparse_new ($) {
+ my ($self) = @_;
+ my $qp = $self->qp_init_common;
+ my $cb = $qp->can('add_valuerangeprocessor') //
+ $qp->can('add_rangeprocessor'); # Xapian 1.5.0+
+ $cb->($qp, $PublicInbox::Search::NVRP->new(AT, 'd:')); # mairix compat
+ $cb->($qp, $PublicInbox::Search::NVRP->new(AT, 'dt:')); # mail compat
+ $cb->($qp, $PublicInbox::Search::NVRP->new(CT, 'ct:'));
+
+ while (my ($name, $pfx) = each %bool_pfx_external) {
+ $qp->add_boolean_prefix($name, $_) for split(/ /, $pfx);
+ }
+ while (my ($name, $pfx) = each %prob_prefix) {
+ $qp->add_prefix($name, $_) for split(/ /, $pfx);
+ }
+ $qp;
+}
+
+# returns a Xapian::Query to filter by roots
+sub roots_filter { # retry_reopen callback
+ my ($self, $git_dir) = @_;
+ my $xdb = $self->xdb;
+ my $P = 'P'.$git_dir;
+ my ($cur, $end) = ($xdb->postlist_begin($P), $xdb->postlist_end($P));
+ if ($cur == $end) {
+ warn "W: $git_dir not indexed?\n";
+ return;
+ }
+ my @roots = xap_terms('G', $xdb, $cur->get_docid);
+ if (!@roots) {
+ warn "W: $git_dir has no root commits?\n";
+ return;
+ }
+ my $q = $PublicInbox::Search::X{Query}->new('G'.shift(@roots));
+ for my $r (@roots) {
+ $q = $PublicInbox::Search::X{Query}->new(
+ PublicInbox::Search::OP_OR(),
+ $q, 'G'.$r);
+ }
+ $q;
+}
+
+sub mset {
+ my ($self, $qry_str, $opt) = @_;
+ my $qp = $self->{qp} //= cqparse_new($self);
+ my $qry = $qp->parse_query($qry_str, $self->{qp_flags});
+
+ # limit to commits with shared roots
+ if (defined(my $git_dir = $opt->{git_dir})) {
+ my $rf = retry_reopen($self, \&roots_filter, $git_dir)
+ or return;
+
+ $qry = $PublicInbox::Search::X{Query}->new(
+ PublicInbox::Search::OP_FILTER(),
+ $qry, $rf);
+ }
+
+ # we only want commits:
+ $qry = $PublicInbox::Search::X{Query}->new(
+ PublicInbox::Search::OP_FILTER(),
+ $qry, 'T'.'c');
+
+ my $enq = $PublicInbox::Search::X{Enquire}->new($self->xdb);
+ $enq->set_query($qry);
+ if ($opt->{relevance}) {
+ $enq->set_sort_by_relevance_then_value(CT, !$opt->{asc});
+ } else {
+ $enq->set_sort_by_value_then_relevance(CT, !$opt->{asc});
+ }
+ $self->retry_reopen($self->can('enquire_once'), $enq,
+ $opt->{offset} || 0, $opt->{limit} || 50);
+}
+
+1;
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
new file mode 100644
index 00000000..218338da
--- /dev/null
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -0,0 +1,425 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# indexer for git coderepos, just commits and repo paths for now
+# this stores normalized absolute paths of indexed GIT_DIR inside
+# the DB itself and is designed to handle forks by designating roots
+#
+# Unlike mail search, docid isn't tied to NNTP artnum or IMAP UID;
+# there's no serial number dependency at all. The first 32 bits of
+# the commit SHA-(1|256) are used to select a shard.
+#
+# We shard repos using the first 32 bits of sha256($ABS_GIT_DIR)
+#
+# See PublicInbox::CodeSearch (read-only API) for more
+package PublicInbox::CodeSearchIdx;
+use v5.12;
+use parent qw(PublicInbox::Lock PublicInbox::CodeSearch PublicInbox::SearchIdx);
+use PublicInbox::Eml;
+use PublicInbox::DS ();
+use PublicInbox::IPC qw(nproc_shards);
+use PublicInbox::Admin;
+use POSIX qw(WNOHANG SEEK_SET);
+use File::Path ();
+use File::Spec ();
+use PublicInbox::SHA qw(sha256_hex);
+use PublicInbox::Search qw(xap_terms);
+use PublicInbox::SearchIdx qw(add_val);
+use PublicInbox::Config;
+use PublicInbox::Spawn qw(run_die);
+
+# stop walking history if we see >$SEEN_MAX existing commits, this assumes
+# branches don't diverge by more than this number of commits...
+# git walks commits quickly if it doesn't have to read trees
+our $SEEN_MAX = 100000;
+
+# TODO: do we care about committer name + email? or tree OID?
+my @FMT = qw(H P ct an ae at s b); # (b)ody must be last
+my @LOG_STDIN = (qw(log --no-decorate --no-color --no-notes -p --stat -M
+ --stdin --no-walk=unsorted), '--pretty=format:%n%x00'.
+ join('%n', map { "%$_" } @FMT));
+
+sub new {
+ my (undef, $dir, $opt) = @_;
+ my $l = $opt->{indexlevel} // 'full';
+ $l !~ $PublicInbox::SearchIdx::INDEXLEVELS and
+ die "invalid indexlevel=$l\n";
+ $l eq 'basic' and die "E: indexlevel=basic not supported\n";
+ my $self = bless {
+ xpfx => "$dir/cidx". PublicInbox::CodeSearch::CIDX_SCHEMA_VER,
+ cidx_dir => $dir,
+ creat => 1, # TODO: get rid of this, should be implicit
+ indexlevel => $l,
+ transact_bytes => 0, # for checkpoint
+ total_bytes => 0, # for lock_release
+ current_info => '',
+ parallel => 1,
+ -opt => $opt,
+ lock_path => "$dir/cidx.lock",
+ }, __PACKAGE__;
+ $self->{nshard} = count_shards($self) ||
+ nproc_shards({nproc => $opt->{jobs}});
+ $self->{-no_fsync} = 1 if !$opt->{fsync};
+ $self->{-dangerous} = 1 if $opt->{dangerous};
+ $self;
+}
+
+# TODO: may be used for reshard/compact
+sub count_shards { scalar($_[0]->xdb_shards_flat) }
+
+sub add_commit ($$) {
+ my ($self, $cmt) = @_; # fields from @FMT
+ my $x = 'Q'.$cmt->{H};
+ for (docids_by_postlist($self, $x)) {
+ $self->{xdb}->delete_document($_)
+ }
+ my $doc = $PublicInbox::Search::X{Document}->new;
+ $doc->add_boolean_term($x);
+ $doc->add_boolean_term('G'.$_) for @{$self->{roots}};
+ $doc->add_boolean_term('XP'.$_) for split(/ /, $cmt->{P});
+ $doc->add_boolean_term('T'.'c');
+
+ # Author-Time is compatible with dt: for mail search schema_version=15
+ add_val($doc, PublicInbox::CodeSearch::AT,
+ POSIX::strftime('%Y%m%d%H%M%S', gmtime($cmt->{at})));
+
+ # Commit-Time is the fallback used by rt: (TS) for mail search:
+ add_val($doc, PublicInbox::CodeSearch::CT, $cmt->{ct});
+
+ $self->term_generator->set_document($doc);
+
+ # email address is always indexed with positional data for usability
+ $self->index_phrase("$cmt->{an} <$cmt->{ae}>", 1, 'A');
+
+ $x = $cmt->{'s'};
+ $self->index_text($x, 1, 'S') if $x =~ /\S/s;
+ $doc->set_data($x); # subject is the first (and currently only) line
+
+ $x = delete $cmt->{b};
+ $self->index_body_text($doc, \$x) if $x =~ /\S/s;
+ $self->{xdb}->add_document($doc);
+}
+
+sub progress {
+ my ($self, @msg) = @_;
+ my $pr = $self->{-opt}->{-progress} or return;
+ $pr->($self->{git} ? ("$self->{git}->{git_dir}: ") : (), @msg, "\n");
+}
+
+sub store_repo ($$) {
+ my ($self, $repo) = @_;
+ my $xdb = delete($repo->{shard})->idx_acquire;
+ $xdb->begin_transaction;
+ if (defined $repo->{id}) {
+ my $doc = $xdb->get_document($repo->{id}) //
+ die "$self->{git}->{git_dir} doc #$repo->{id} gone";
+ add_val($doc, PublicInbox::CodeSearch::CT, $repo->{ct});
+ my %new = map { $_ => undef } @{$self->{roots}};
+ my $old = xap_terms('G', $doc);
+ delete @new{keys %$old};
+ $doc->add_boolean_term('G'.$_) for keys %new;
+ delete @$old{@{$self->{roots}}};
+ $doc->remove_term('G'.$_) for keys %$old;
+ $doc->set_data($repo->{fp});
+ $xdb->replace_document($repo->{id}, $doc);
+ } else {
+ my $new = $PublicInbox::Search::X{Document}->new;
+ add_val($new, PublicInbox::CodeSearch::CT, $repo->{ct});
+ $new->add_boolean_term("P$self->{git}->{git_dir}");
+ $new->add_boolean_term('T'.'r');
+ $new->add_boolean_term('G'.$_) for @{$repo->{roots}};
+ $new->set_data($repo->{fp}); # \n delimited
+ $xdb->add_document($new);
+ }
+ $xdb->commit_transaction;
+}
+
+# sharded reader for `git log --pretty=format: --stdin'
+sub shard_worker ($$$) {
+ my ($self, $r, $sigset) = @_;
+ my ($quit, $cmt);
+ my $batch_bytes = $self->{-opt}->{batch_size} //
+ $PublicInbox::SearchIdx::BATCH_BYTES;
+ my $max = $batch_bytes;
+ $SIG{USR1} = sub { $max = -1 }; # similar to `git fast-import'
+ $SIG{QUIT} = $SIG{TERM} = $SIG{INT} = sub { $quit = shift };
+ PublicInbox::DS::sig_setmask($sigset);
+
+ # the parent process of this shard process writes directly to
+ # the stdin of `git log', we consume git log's stdout:
+ my $rd = $self->{git}->popen(@LOG_STDIN, undef, { 0 => $r });
+ close $r or die "close: $!";
+ my $nr = 0;
+
+ # a patch may have \0, see c4201214cbf10636e2c1ab9131573f735b42c8d4
+ # in linux.git, so we use $/ = "\n\0" to check end-of-patch
+ my $FS = "\n\0";
+ local $/ = $FS;
+ my $buf = <$rd> // return; # leading $FS
+ $buf eq $FS or die "BUG: not LF-NUL: $buf\n";
+ my $xdb = $self->idx_acquire;
+ $xdb->begin_transaction;
+ while (defined($buf = <$rd>)) {
+ chomp($buf);
+ $max -= length($buf);
+ @$cmt{@FMT} = split(/\n/, $buf, scalar(@FMT));
+ $/ = "\n";
+ add_commit($self, $cmt);
+ last if $quit; # likely SIGPIPE
+ ++$nr;
+ if ($max <= 0 && !$PublicInbox::Search::X{CLOEXEC_UNSET}) {
+ progress($self, $nr);
+ $xdb->commit_transaction;
+ $max = $batch_bytes;
+ $xdb->begin_transaction;
+ }
+ $/ = $FS;
+ }
+ close($rd);
+ if (!$? || ($quit && ($? & 127) == POSIX::SIGPIPE)) {
+ $xdb->commit_transaction;
+ } else {
+ warn "E: git @LOG_STDIN: \$?=$?\n";
+ $xdb->cancel_transaction;
+ }
+}
+
+sub seen ($$) {
+ my ($xdb, $q) = @_; # $q = "Q$COMMIT_HASH"
+ $xdb->postlist_begin($q) != $xdb->postlist_end($q)
+}
+
+# used to select the shard for a GIT_DIR
+sub git_dir_hash ($) { hex(substr(sha256_hex($_[0]), 0, 8)) }
+
+sub docids_by_postlist ($$) { # consider moving to PublicInbox::Search
+ my ($self, $q) = @_;
+ my $cur = $self->{xdb}->postlist_begin($q);
+ my $end = $self->{xdb}->postlist_end($q);
+ my @ids;
+ for (; $cur != $end; $cur++) { push(@ids, $cur->get_docid) };
+ @ids;
+}
+
+sub get_roots ($$) {
+ my ($self, $refs) = @_;
+ my @roots = $self->{git}->qx([qw(rev-list --stdin --max-parents=0)],
+ undef, { 0 => $refs });
+ die "git rev-list \$?=$?" if $?;
+ sysseek($refs, 0, SEEK_SET) or die "seek: $!"; # for rev-list --stdin
+ chomp(@roots);
+ scalar(@roots) ? \@roots : undef;
+}
+
+# this is different from the grokmirror-compatible fingerprint since we
+# only care about --heads (branches) and --tags, and not even their names
+sub cidx_fp ($) {
+ my ($self) = @_;
+ open my $refs, '+>', undef or die "open: $!";
+ run_die(['git', "--git-dir=$self->{git}->{git_dir}",
+ qw(show-ref --heads --tags --hash)], undef, { 1 => $refs });
+ seek($refs, 0, SEEK_SET) or die "seek: $!";
+ my $buf;
+ my $dig = PublicInbox::SHA->new(256);
+ while (read($refs, $buf, 65536)) { $dig->add($buf) }
+ sysseek($refs, 0, SEEK_SET) or die "seek: $!"; # for rev-list --stdin
+ ($dig->hexdigest, $refs);
+}
+
+# TODO: should we also index gitweb.owner and the full fingerprint for grokmirror?
+sub prep_git_dir ($) {
+ my ($self) = @_;
+ my $git_dir = $self->{git}->{git_dir};
+ my $ct = $self->{git}->qx([qw[for-each-ref
+ --sort=-committerdate --format=%(committerdate:raw) --count=1
+ refs/heads/ refs/tags/]]);
+ my $repo = {};
+ @$repo{qw(fp refs)} = cidx_fp($self);
+ $repo->{roots} = get_roots($self, $repo->{refs});
+ if (!$repo->{roots} || !defined($ct)) {
+ warn "W: $git_dir has no root commits, skipping\n";
+ return;
+ }
+ $ct =~ s/ .*\z//s; # drop TZ
+ $repo->{ct} = $ct + 0;
+ my $n = git_dir_hash($git_dir) % $self->{nshard};
+ my $shard = $repo->{shard} = bless { %$self, shard => $n }, ref($self);
+ delete @$shard{qw(lockfh lock_path)};
+ local $shard->{xdb};
+ my $xdb = $shard->idx_acquire;
+ my @docids = docids_by_postlist($shard, 'P'.$git_dir);
+ my $docid = shift(@docids) // return $repo;
+ if (@docids) {
+ warn "BUG: $git_dir indexed multiple times, culling\n";
+ $xdb->begin_transaction;
+ for (@docids) { $xdb->delete_document($_) }
+ $xdb->commit_transaction;
+ }
+ my $doc = $xdb->get_document($docid) //
+ die "BUG: no #$docid ($git_dir)";
+ my $old_fp = $doc->get_data;
+ if ($old_fp eq $repo->{fp}) { # no change
+ progress($self, 'unchanged');
+ return;
+ }
+ $repo->{id} = $docid;
+ $repo;
+}
+
+sub partition_refs ($$) {
+ my ($self, $refs) = @_; # show-ref --heads --tags --hash output
+ my $fh = $self->{git}->popen(qw(rev-list --stdin), undef,
+ { 0 => $refs });
+ close $refs or die "close: $!";
+ local $self->{xdb};
+ my $xdb = $self->{-opt}->{reindex} ? undef : $self->xdb;
+ my ($seen, $nchange, $nshard) = (0, 0, $self->{nshard});
+ my @shard_in;
+ for (0..($nshard - 1)) {
+ open $shard_in[$_], '+>', undef or die "open: $!";
+ }
+ while (defined(my $cmt = <$fh>)) {
+ chomp $cmt;
+ if ($xdb && seen($xdb, 'Q'.$cmt)) {
+ last if ++$seen > $SEEN_MAX;
+ } else {
+ my $n = hex(substr($cmt, 0, 8)) % $nshard;
+ say { $shard_in[$n] } $cmt or die "say: $!";
+ ++$nchange;
+ $seen = 0;
+ }
+ }
+ close($fh);
+ if (!$? || (($? & 127) == POSIX::SIGPIPE && $seen > $SEEN_MAX)) {
+ $self->{nchange} += $nchange;
+ progress($self, "$nchange commits");
+ for my $fh (@shard_in) {
+ $fh->flush or die "flush: $!";
+ sysseek($fh, 0, SEEK_SET) or die "seek: $!";
+ }
+ return @shard_in;
+ }
+ die "git-rev-list: \$?=$?\n";
+}
+
+sub index_git_dir ($$) {
+ my ($self, $git_dir) = @_;
+ local $self->{git} = PublicInbox::Git->new($git_dir); # for ->patch_id
+ my $repo = prep_git_dir($self) or return;
+ local $self->{current_info} = $git_dir;
+ my @shard_in = partition_refs($self, delete($repo->{refs}));
+ my %pids;
+ my $fwd_kill = sub {
+ my ($sig) = @_;
+ kill($sig, $_) for keys %pids;
+ };
+ local $SIG{USR1} = $fwd_kill;
+ local $SIG{QUIT} = $fwd_kill;
+ local $SIG{INT} = $fwd_kill;
+ local $SIG{TERM} = $fwd_kill;
+ my $sigset = PublicInbox::DS::block_signals();
+ for (my $n = 0; $n <= $#shard_in; $n++) {
+ -s $shard_in[$n] or next;
+ my $pid = fork // die "fork: $!";
+ if ($pid == 0) { # no RNG use, here
+ $0 = "code index [$n]";
+ $self->{shard} = $n;
+ $self->{current_info} = "$self->{current_info} [$n]";
+ delete @$self{qw(lockfh lock_path)};
+ my $in = $shard_in[$n];
+ @shard_in = ();
+ $self->{roots} = delete $repo->{roots};
+ undef $repo;
+ eval { shard_worker($self, $in, $sigset) };
+ warn "E: $@" if $@;
+ POSIX::_exit($@ ? 1 : 0);
+ } else {
+ $pids{$pid} = "code index [$n]";
+ }
+ }
+ PublicInbox::DS::sig_setmask($sigset);
+ @shard_in = ();
+ my $err;
+ while (keys %pids) {
+ my $pid = waitpid(-1, 0) or last;
+ my $j = delete $pids{$pid} // "unknown PID:$pid";
+ next if $? == 0;
+ warn "PID:$pid $j exited with \$?=$?\n";
+ $err = 1;
+ }
+ die "subprocess(es) failed\n" if $err;
+ store_repo($self, $repo);
+ progress($self, 'done');
+ # TODO: check fp afterwards?
+}
+
+# for PublicInbox::SearchIdx::patch_id and with_umask
+sub git { $_[0]->{git} }
+
+sub load_existing ($) { # for -u/--update
+ my ($self) = @_;
+ my $dirs = $self->{git_dirs} // [];
+ if ($self->{-opt}->{update}) {
+ local $self->{xdb};
+ $self->xdb or
+ die "E: $self->{cidx_dir} non-existent for --update\n";
+ my @cur = $self->all_terms('P');
+ push @$dirs, @cur;
+ }
+ my %uniq; # List::Util::uniq requires Perl 5.26+
+ @$dirs = grep { !$uniq{$_}++ } @$dirs;
+}
+
+sub cidx_init ($) {
+ my ($self) = @_;
+ my $dir = $self->{cidx_dir};
+ unless (-d $dir) {
+ warn "# creating $dir\n" if !$self->{-opt}->{quiet};
+ File::Path::mkpath($dir);
+ }
+ for my $n (0..($self->{nshard} - 1)) {
+ my $shard = bless { %$self, shard => $n }, ref($self);
+ $shard->idx_acquire;
+ }
+ # this warning needs to happen after idx_acquire
+ state $once;
+ warn <<EOM if $PublicInbox::Search::X{CLOEXEC_UNSET} && !$once++;
+W: Xapian v1.2.21..v1.2.24 were missing close-on-exec on OFD locks,
+W: memory usage may be high for large indexing runs
+EOM
+}
+
+sub cidx_run { # entry point: prepare shards, lock, then index each git_dir
+	my ($self) = @_;
+	cidx_init($self);
+	local $self->{current_info} = '';
+	my $cb = $SIG{__WARN__} || \&CORE::warn;
+	local $SIG{__WARN__} = sub { # prefix warnings with current_info if set
+		my $m = shift @_;
+		$self->{current_info} eq '' or
+			$m =~ s/\A(#?\s*)/$1$self->{current_info}: /;
+		$cb->($m, @_);
+	};
+	$self->lock_acquire;
+	load_existing($self);
+	my @nc = grep { File::Spec->canonpath($_) ne $_ } @{$self->{git_dirs}};
+	if (@nc) {
+		warn "E: BUG? paths in $self->{cidx_dir} not canonicalized:\n";
+		for my $d (@{$self->{git_dirs}}) {
+			my $c = File::Spec->canonpath($d);
+			warn "E: $d => $c\n";
+			$d = $c; # $d aliases the element, so git_dirs is updated in place
+		}
+		warn "E: canonicalized and attempting to continue\n";
+	}
+	local $self->{nchange} = 0;
+	# do_prune($self) if $self->{-opt}->{prune}; TODO
+	if ($self->{-opt}->{scan} // 1) { # scan defaults to on when unset
+		for my $gd (@{$self->{git_dirs}}) {
+			index_git_dir($self, $gd);
+		}
+	}
+	$self->lock_release(!!$self->{nchange});
+}
+
+1;
diff --git a/lib/PublicInbox/MiscIdx.pm b/lib/PublicInbox/MiscIdx.pm
index 19200b92..6708527d 100644
--- a/lib/PublicInbox/MiscIdx.pm
+++ b/lib/PublicInbox/MiscIdx.pm
@@ -5,7 +5,7 @@
# Things indexed include:
# * inboxes themselves
# * epoch information
-# * (maybe) git code repository information
+# * (maybe) git code repository information (not commits)
# Expect ~100K-1M documents with no parallelism opportunities,
# so no sharding, here.
#
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 7aba2445..5133a3b7 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -110,43 +110,50 @@ sub load_xapian () {
# a prefix common in patch emails
our $LANG = 'english';
+our %PATCH_BOOL_COMMON = (
+ dfpre => 'XDFPRE',
+ dfpost => 'XDFPOST',
+ dfblob => 'XDFPRE XDFPOST',
+ patchid => 'XDFID',
+);
+
# note: the non-X term prefix allocations are shared with
# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
my %bool_pfx_external = (
mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
lid => 'G', # newsGroup (or similar entity), just inside <>
- dfpre => 'XDFPRE',
- dfpost => 'XDFPOST',
- dfblob => 'XDFPRE XDFPOST',
- patchid => 'XDFID',
+ %PATCH_BOOL_COMMON
);
-my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
-my %prob_prefix = (
- # for mairix compatibility
+# for mairix compatibility
+our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
+our %PATCH_PROB_COMMON = (
s => 'S',
- m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
- l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
f => 'A',
- t => 'XTO',
- tc => 'XTO XCC',
- c => 'XCC',
- tcf => 'XTO XCC A',
- a => 'XTO XCC A',
- b => $non_quoted_body . ' XQUOT',
- bs => $non_quoted_body . ' XQUOT S',
+ b => $NON_QUOTED_BODY . ' XQUOT',
+ bs => $NON_QUOTED_BODY . ' XQUOT S',
n => 'XFN',
q => 'XQUOT',
- nq => $non_quoted_body,
+ nq => $NON_QUOTED_BODY,
dfn => 'XDFN',
dfa => 'XDFA',
dfb => 'XDFB',
dfhh => 'XDFHH',
dfctx => 'XDFCTX',
+);
+my %prob_prefix = (
+ m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
+ l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
+ t => 'XTO',
+ tc => 'XTO XCC',
+ c => 'XCC',
+ tcf => 'XTO XCC A',
+ a => 'XTO XCC A',
+ %PATCH_PROB_COMMON,
# default:
- '' => 'XM S A XQUOT XFN ' . $non_quoted_body,
+ '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY,
);
# not documenting m: and mid: for now, the using the URLs works w/o Xapian
@@ -305,7 +312,7 @@ sub date_parse_prepare {
$x = "\0%Y%m%d%H%M%S$#$to_parse\0";
}
}
- } else { # "rt", let git interpret "YYYY", deal with Y10K later :P
+ } else { # (rt|ct), let git interpret "YYYY", deal with Y10K later :P
for my $x (@r) {
next if $x eq '' || $x =~ /\A[0-9]{5,}\z/;
push @$to_parse, $x;
@@ -454,20 +461,24 @@ sub mset_to_smsg {
# read-write
sub stemmer { $X{Stem}->new($LANG) }
-# read-only
-sub qparse_new {
+sub qp_init_common {
my ($self) = @_;
-
- my $xdb = xdb($self);
my $qp = $X{QueryParser}->new;
$qp->set_default_op(OP_AND());
- $qp->set_database($xdb);
+ $qp->set_database(xdb($self));
$qp->set_stemmer(stemmer($self));
$qp->set_stemming_strategy(STEM_SOME());
my $cb = $qp->can('set_max_wildcard_expansion') //
$qp->can('set_max_expansion'); # Xapian 1.5.0+
$cb->($qp, 100);
- $cb = $qp->can('add_valuerangeprocessor') //
+ $qp;
+}
+
+# read-only
+sub qparse_new {
+ my ($self) = @_;
+ my $qp = qp_init_common($self);
+ my $cb = $qp->can('add_valuerangeprocessor') //
$qp->can('add_rangeprocessor'); # Xapian 1.5.0+
$cb->($qp, $NVRP->new(YYYYMMDD, 'd:'));
$cb->($qp, $NVRP->new(DT, 'dt:'));
@@ -546,7 +557,7 @@ sub xap_terms ($$;@) {
}
# get combined docid from over.num:
-# (not generic Xapian, only works with our sharding scheme)
+# (not generic Xapian, only works with our sharding scheme for mail)
sub num2docid ($$) {
my ($self, $num) = @_;
my $nshard = $self->{nshard};
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index fc464383..3baeaa9c 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -134,6 +134,7 @@ sub idx_acquire {
load_xapian_writable();
$flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN;
}
+ my $owner = $self->{ibx} // $self->{eidx} // $self;
if ($self->{creat}) {
require File::Path;
$self->lock_acquire;
@@ -145,14 +146,13 @@ sub idx_acquire {
File::Path::mkpath($dir);
require PublicInbox::Syscall;
PublicInbox::Syscall::nodatacow_dir($dir);
- $self->{-set_has_threadid_once} = 1;
- if (($self->{ibx} // $self->{eidx})->{-dangerous}) {
- $flag |= $DB_DANGEROUS;
- }
+ # owner == self for CodeSearchIdx
+ $self->{-set_has_threadid_once} = 1 if $owner != $self;
+ $flag |= $DB_DANGEROUS if $owner->{-dangerous};
}
}
return unless defined $flag;
- $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync};
+ $flag |= $DB_NO_SYNC if $owner->{-no_fsync};
my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) };
croak "Failed opening $dir: $@" if $@;
$self->{xdb} = $xdb;
@@ -350,43 +350,30 @@ sub index_diff ($$$) {
index_text($self, join("\n", @$xnq), 1, 'XNQ');
}
-sub index_xapian { # msg_iter callback
- my $part = $_[0]->[0]; # ignore $depth and $idx
- my ($self, $doc) = @{$_[1]};
- my $ct = $part->content_type || 'text/plain';
- my $fn = $part->filename;
- if (defined $fn && $fn ne '') {
- index_phrase($self, $fn, 1, 'XFN');
- }
- if ($part->{is_submsg}) {
- my $mids = mids_for_index($part);
- index_ids($self, $doc, $part, $mids);
- my $smsg = bless {}, 'PublicInbox::Smsg';
- $smsg->populate($part);
- index_headers($self, $smsg);
- }
-
- my ($s, undef) = msg_part_text($part, $ct);
- defined $s or return;
- $_[0]->[0] = $part = undef; # free memory
+sub patch_id {
+ my ($self) = @_; # $_[1] is the diff (may be huge)
+ open(my $fh, '+>:utf8', undef) or die "open: $!";
+ open(my $eh, '+>', undef) or die "open: $!";
+ $fh->autoflush(1);
+ print $fh $_[1] or die "print: $!";
+ sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";
+ my $id = ($self->{ibx} // $self->{eidx} // $self)->git->qx(
+ [qw(patch-id --stable)], {}, { 0 => $fh, 2 => $eh });
+ seek($eh, 0, SEEK_SET) or die "seek: $!";
+ while (<$eh>) { warn $_ }
+ $id =~ /\A([a-f0-9]{40,})/ ? $1 : undef;
+}
- if ($s =~ /^(?:diff|---|\+\+\+) /ms) {
- open(my $fh, '+>:utf8', undef) or die "open: $!";
- open(my $eh, '+>', undef) or die "open: $!";
- $fh->autoflush(1);
- print $fh $s or die "print: $!";
- sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";
- my $id = ($self->{ibx} // $self->{eidx})->git->qx(
- [qw(patch-id --stable)],
- {}, { 0 => $fh, 2 => $eh });
- $id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1);
- seek($eh, 0, SEEK_SET) or die "seek: $!";
- while (<$eh>) { warn $_ }
+sub index_body_text {
+ my ($self, $doc, $sref) = @_;
+ if ($$sref =~ /^(?:diff|---|\+\+\+) /ms) {
+ my $id = patch_id($self, $$sref);
+ $doc->add_term('XDFID'.$id) if defined($id);
}
# split off quoted and unquoted blocks:
- my @sections = PublicInbox::MsgIter::split_quotes($s);
- undef $s; # free memory
+ my @sections = PublicInbox::MsgIter::split_quotes($$sref);
+ undef $$sref; # free memory
for my $txt (@sections) {
if ($txt =~ /\A>/) {
if ($txt =~ /^[>\t ]+GIT binary patch\r?/sm) {
@@ -396,8 +383,7 @@ sub index_xapian { # msg_iter callback
(?:[>\h]+$BASE85\h*\r?\n)+/$1/gsmx;
}
index_text($self, $txt, 0, 'XQUOT');
- } else {
- # does it look like a diff?
+ } else { # does it look like a diff?
if ($txt =~ /^(?:diff|---|\+\+\+) /ms) {
index_diff($self, \$txt, $doc);
} else {
@@ -408,6 +394,28 @@ sub index_xapian { # msg_iter callback
}
}
+sub index_xapian { # msg_iter callback
+ my $part = $_[0]->[0]; # ignore $depth and $idx
+ my ($self, $doc) = @{$_[1]};
+ my $ct = $part->content_type || 'text/plain';
+ my $fn = $part->filename;
+ if (defined $fn && $fn ne '') {
+ index_phrase($self, $fn, 1, 'XFN');
+ }
+ if ($part->{is_submsg}) {
+ my $mids = mids_for_index($part);
+ index_ids($self, $doc, $part, $mids);
+ my $smsg = bless {}, 'PublicInbox::Smsg';
+ $smsg->populate($part);
+ index_headers($self, $smsg);
+ }
+
+ my ($s, undef) = msg_part_text($part, $ct);
+ defined $s or return;
+ $_[0]->[0] = $part = undef; # free memory
+ index_body_text($self, $doc, \$s);
+}
+
sub index_list_id ($$$) {
my ($self, $doc, $hdr) = @_;
for my $l ($hdr->header_raw('List-Id')) {
diff --git a/lib/PublicInbox/TestCommon.pm b/lib/PublicInbox/TestCommon.pm
index ed28ac48..494323c0 100644
--- a/lib/PublicInbox/TestCommon.pm
+++ b/lib/PublicInbox/TestCommon.pm
@@ -21,6 +21,7 @@ BEGIN {
@EXPORT = qw(tmpdir tcp_server tcp_connect require_git require_mods
run_script start_script key2sub xsys xsys_e xqx eml_load tick
have_xapian_compact json_utf8 setup_public_inboxes create_inbox
+ create_coderepo
tcp_host_port test_lei lei lei_ok $lei_out $lei_err $lei_opt
test_httpd xbail require_cmd is_xdeeply tail_f
ignore_inline_c_missing);
@@ -325,7 +326,7 @@ sub run_script ($;$$) {
}
}
my $tail = @tail_paths ? tail_f(@tail_paths) : undef;
- if ($key =~ /-(index|convert|extindex|convert|xcpdb)\z/) {
+ if ($key =~ /-(index|cindex|extindex|convert|xcpdb)\z/) {
unshift @argv, '--no-fsync';
}
if ($run_mode == 0) {
@@ -698,6 +699,44 @@ sub setup_public_inboxes () {
@ret;
}
+our %COMMIT_ENV = (
+ GIT_AUTHOR_NAME => 'A U Thor',
+ GIT_COMMITTER_NAME => 'C O Mitter',
+ GIT_AUTHOR_EMAIL => 'a@example.com',
+ GIT_COMMITTER_EMAIL => 'c@example.com',
+);
+
+sub create_coderepo ($$;@) {
+ my $ident = shift;
+ my $cb = pop;
+ my %opt = @_;
+ require PublicInbox::Lock;
+ require PublicInbox::Import;
+ my ($base) = ($0 =~ m!\b([^/]+)\.[^\.]+\z!);
+ my ($db) = (PublicInbox::Import::default_branch() =~ m!([^/]+)\z!);
+ my $dir = "t/data-gen/$base.$ident-$db";
+ my $new = !-d $dir;
+ if ($new && !mkdir($dir)) {
+ my $err = $!;
+ -d $dir or xbail "mkdir($dir): $err";
+ }
+ my $lk = bless { lock_path => "$dir/creat.lock" }, 'PublicInbox::Lock';
+ my $scope = $lk->lock_for_scope;
+ my $tmpdir = delete $opt{tmpdir};
+ if (!-f "$dir/creat.stamp") {
+ opendir(my $dfh, '.') or xbail "opendir .: $!";
+ chdir($dir) or xbail "chdir($dir): $!";
+ local %ENV = (%ENV, %COMMIT_ENV);
+ $cb->($dir);
+ chdir($dfh) or xbail "cd -: $!";
+ open my $s, '>', "$dir/creat.stamp" or
+ BAIL_OUT "error creating $dir/creat.stamp: $!";
+ }
+ return $dir if !defined($tmpdir);
+ xsys_e([qw(/bin/cp -Rp), $dir, $tmpdir]);
+ $tmpdir;
+}
+
sub create_inbox ($$;@) {
my $ident = shift;
my $cb = pop;
diff --git a/lib/PublicInbox/ViewVCS.pm b/lib/PublicInbox/ViewVCS.pm
index de8600ee..716582a6 100644
--- a/lib/PublicInbox/ViewVCS.pm
+++ b/lib/PublicInbox/ViewVCS.pm
@@ -34,7 +34,7 @@ my $hl = eval {
my %QP_MAP = ( A => 'oid_a', a => 'path_a', b => 'path_b' );
our $MAX_SIZE = 1024 * 1024; # TODO: configurable
-my $BIN_DETECT = 8000; # same as git
+my $BIN_DETECT = 8000; # same as git (buffer_is_binary())
my $SHOW_FMT = '--pretty=format:'.join('%n', '%P', '%p', '%H', '%T', '%s', '%f',
'%an <%ae> %ai', '%cn <%ce> %ci', '%b%x00');
diff --git a/script/public-inbox-cindex b/script/public-inbox-cindex
new file mode 100755
index 00000000..d3a5bfca
--- /dev/null
+++ b/script/public-inbox-cindex
@@ -0,0 +1,75 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+my $help = <<EOF; # the following should fit w/o scrolling in 80x24 term:
+usage: public-inbox-cindex [options] GIT_DIR...
+usage: public-inbox-cindex [options] --project-list=FILE PROJECT_ROOT
+
+ Create and update search indices for code repos
+
+ -d DIR use DIR instead of GIT_DIR/public-inbox-cindex
+ --no-fsync speed up indexing, risk corruption on power outage
+ -L LEVEL `medium', or `full' (default: medium)
+ --project-list=FILE use a cgit/gitweb-compatible list of projects
+ --update | -u update previously-indexed code repos with `-d'
+ --jobs=NUM set or disable parallelization (NUM=0)
+ --batch-size=BYTES flush changes to OS after a given number of bytes
+ --prune prune old repos and commits
+ --reindex reindex previously indexed repos
+ --verbose | -v increase verbosity (may be repeated)
+
+BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
+See public-inbox-cindex(1) man page for full documentation.
+EOF
+my $opt = { fsync => 1, scan => 1 }; # --no-scan is hidden
+GetOptions($opt, qw(quiet|q verbose|v+ reindex jobs|j=i fsync|sync! dangerous
+ indexlevel|index-level|L=s batch_size|batch-size=s
+ project-list=s
+ d=s update|u scan! prune dry-run|n C=s@ help|h))
+ or die $help;
+if ($opt->{help}) { print $help; exit 0 };
+die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
+require IO::Handle;
+STDOUT->autoflush(1);
+STDERR->autoflush(1);
+local $SIG{USR1} = 'IGNORE'; # to be overridden in cidx_sync
+# require lazily to speed up --help
+require PublicInbox::Admin;
+PublicInbox::Admin::do_chdir(delete $opt->{C});
+my $cfg = PublicInbox::Config->new;
+my $cidx_dir = $opt->{d};
+PublicInbox::Admin::require_or_die('Search::Xapian');
+PublicInbox::Admin::progress_prepare($opt);
+my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+%ENV = (%ENV, %$env) if $env;
+
+require PublicInbox::CodeSearchIdx; # unstable internal API
+my @git_dirs;
+if (defined(my $pl = $opt->{'project-list'})) {
+ my $pfx = shift @ARGV // die <<EOM;
+PROJECTS_DIR required for --project-list
+EOM
+ open my $fh, '<', $pl or die "open($pl): $!\n";
+ chomp(@git_dirs = <$fh>);
+ $_ = PublicInbox::Admin::resolve_git_dir("$pfx/$_") for @git_dirs;
+} else {
+ @git_dirs = map { PublicInbox::Admin::resolve_git_dir($_) } @ARGV;
+}
+if (defined $cidx_dir) { # external index
+ die "`%' is not allowed in $cidx_dir\n" if $cidx_dir =~ /\%/;
+ my $cidx = PublicInbox::CodeSearchIdx->new($cidx_dir, $opt);
+ @{$cidx->{git_dirs}} = @git_dirs; # may be empty
+ $cidx->cidx_run;
+} elsif (!@git_dirs) {
+ die $help
+} else {
+ for my $gd (@git_dirs) {
+ my $cd = "$gd/public-inbox-cindex";
+ my $cidx = PublicInbox::CodeSearchIdx->new($cd, { %$opt });
+ $cidx->{-internal} = 1;
+ @{$cidx->{git_dirs}} = ($gd);
+ $cidx->cidx_run;
+ }
+}
diff --git a/t/cindex.t b/t/cindex.t
new file mode 100644
index 00000000..c93e4e4e
--- /dev/null
+++ b/t/cindex.t
@@ -0,0 +1,98 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+use v5.12;
+use PublicInbox::TestCommon;
+use Cwd qw(getcwd abs_path);
+require_mods(qw(json Search::Xapian));
+use_ok 'PublicInbox::CodeSearchIdx';
+require PublicInbox::Import;
+my ($tmp, $for_destroy) = tmpdir();
+my $pwd = getcwd();
+
+# I reworked CodeSearchIdx->shard_worker to handle empty trees
+# in the initial commit generated by cvs2svn for xapian.git
+create_coderepo 'empty-tree-root', tmpdir => "$tmp/wt0", sub {
+ xsys_e([qw(/bin/sh -c), <<'EOM']);
+git init -q &&
+tree=$(git mktree </dev/null) &&
+head=$(git symbolic-ref HEAD) &&
+cmt=$(echo 'empty root' | git commit-tree $tree) &&
+git update-ref $head $cmt &&
+echo hi >f &&
+git add f &&
+git commit -q -m hi &&
+git gc -q
+EOM
+}; # /create_coderepo
+
+ok(run_script([qw(-cindex --dangerous -q), "$tmp/wt0"]), 'cindex internal');
+ok(-e "$tmp/wt0/.git/public-inbox-cindex/cidx.lock", 'internal dir created');
+
+
+# it's possible for git to emit NUL characters in diffs
+# (see c4201214cbf10636e2c1ab9131573f735b42c8d4 in linux.git)
+my $zp = create_coderepo 'NUL in patch', sub {
+ require PublicInbox::Git;
+ my $src = PublicInbox::Git::try_cat("$pwd/COPYING");
+ xsys_e([qw(git init -q)]);
+
+ # needs to be further than FIRST_FEW_BYTES (8000) in git.git
+ $src =~ s/\b(Limitation of Liability\.)\n\n/$1\n\0\n/s or
+ xbail "BUG: no `\\n\\n' in $pwd/COPYING";
+
+ open my $fh, '>', 'f' or xbail "open: $!";
+ print $fh $src or xbail "print: $!";
+ close $fh or xbail "close: $!";
+ xsys_e([qw(/bin/sh -c), <<'EOM']);
+git add f &&
+git commit -q -m 'initial with NUL character'
+EOM
+ $src =~ s/\n\0\n/\n\n/ or xbail "BUG: no `\\n\\0\\n'";
+ open $fh, '>', 'f' or xbail "open: $!";
+ print $fh $src or xbail "print: $!";
+ close $fh or xbail "close: $!";
+ xsys_e([qw(/bin/sh -c), <<'EOM']);
+git add f &&
+git commit -q -m 'remove NUL character' &&
+git gc -q
+EOM
+}; # /create_coderepo
+
+ok(run_script([qw(-cindex --dangerous -q -d), "$tmp/ext", $zp, "$tmp/wt0"]),
+ 'cindex external');
+ok(-e "$tmp/ext/cidx.lock", 'external dir created');
+ok(!-d "$zp/.git/public-inbox-cindex", 'no cindex in original coderepo');
+
+use_ok 'PublicInbox::CodeSearch';
+if ('multi-repo search') {
+ my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+ my $mset = $csrch->mset('NUL');
+ is(scalar($mset->items), 2, 'got results');
+ my $exp = [ 'initial with NUL character', 'remove NUL character' ];
+ my @have = sort(map { $_->get_document->get_data } $mset->items);
+ is_xdeeply(\@have, $exp, 'got expected subjects');
+
+ $mset = $csrch->mset('NUL', { git_dir => "$tmp/wt0/.git" });
+ is(scalar($mset->items), 0, 'no results with other GIT_DIR');
+
+ $mset = $csrch->mset('NUL', { git_dir => abs_path("$zp/.git") });
+ @have = sort(map { $_->get_document->get_data } $mset->items);
+ is_xdeeply(\@have, $exp, 'got expected subjects w/ GIT_DIR filter');
+}
+
+if ('--update') {
+ my $csrch = PublicInbox::CodeSearch->new("$tmp/ext");
+ my $mset = $csrch->mset('dfn:for-update');
+ is(scalar($mset->items), 0, 'no result before update');
+
+ my $e = \%PublicInbox::TestCommon::COMMIT_ENV;
+ xsys_e([qw(/bin/sh -c), <<'EOM'], $e, { -C => "$tmp/wt0" });
+>for-update && git add for-update && git commit -q -m updated
+EOM
+ ok(run_script([qw(-cindex -qu -d), "$tmp/ext"]), '-cindex -u');
+ $mset = $csrch->reopen->mset('dfn:for-update');
+ is(scalar($mset->items), 1, 'got updated result');
+}
+
+done_testing;
^ permalink raw reply related [flat|nested] 11+ messages in thread