From: Eric Wong <e@80x24.org>
To: spew@80x24.org
Subject: [PATCH 2/2] WIP-reposearchidx
Date: Fri, 17 Feb 2023 04:01:13 +0000 [thread overview]
Message-ID: <20230217040113.111644-2-e@80x24.org> (raw)
In-Reply-To: <20230217040113.111644-1-e@80x24.org>
WIP
---
MANIFEST | 2 +
lib/PublicInbox/CodeSearch.pm | 44 ++++++++++
lib/PublicInbox/CodeSearchIdx.pm | 146 +++++++++++++++++++++++++++++++
lib/PublicInbox/Search.pm | 43 +++++----
lib/PublicInbox/SearchIdx.pm | 38 ++++----
script/public-inbox-cindex | 70 +++++++++++++++
6 files changed, 307 insertions(+), 36 deletions(-)
create mode 100644 lib/PublicInbox/CodeSearch.pm
create mode 100644 lib/PublicInbox/CodeSearchIdx.pm
create mode 100755 script/public-inbox-cindex
diff --git a/MANIFEST b/MANIFEST
index aba32762..6c068b18 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -161,6 +161,7 @@ lib/PublicInbox/AltId.pm
lib/PublicInbox/AutoReap.pm
lib/PublicInbox/Cgit.pm
lib/PublicInbox/CmdIPC4.pm
+lib/PublicInbox/CodeSearchIdx.pm
lib/PublicInbox/CompressNoop.pm
lib/PublicInbox/Config.pm
lib/PublicInbox/ConfigIter.pm
@@ -363,6 +364,7 @@ sa_config/README
sa_config/root/etc/spamassassin/public-inbox.pre
sa_config/user/.spamassassin/user_prefs
script/lei
+script/public-inbox-cindex
script/public-inbox-clone
script/public-inbox-compact
script/public-inbox-convert
diff --git a/lib/PublicInbox/CodeSearch.pm b/lib/PublicInbox/CodeSearch.pm
new file mode 100644
index 00000000..d7dfdb08
--- /dev/null
+++ b/lib/PublicInbox/CodeSearch.pm
@@ -0,0 +1,44 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+
+# read-only external index for coderepos.
+package PublicInbox::CodeSearch;
+use v5.12;
+use parent qw(PublicInbox::Search);
+# Slot numbers and on-disk schema version for the code search index.
+# NOTE(review): AT/CT look like Xapian value slot numbers -- confirm
+# once the indexer actually stores them.
+use constant AT => 0; # author time
+use constant CT => 1; # commit time # should we even care to index this?
+use constant SCHEMA_VERSION => 1;
+
+# bool_pfx_internal:
+# type => 'T', # 'c' - commit, 'r' - repo GIT_DIR
+# tags are not indexed, only normal branches (refs/heads/*), not hidden
+
+# note: the non-X term prefix allocations are shared with
+# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
+#
+# Maps user-visible boolean (exact match) search prefixes to Xapian term
+# prefixes.  The diff-related prefixes are pulled in from the `our' hash
+# in PublicInbox::Search.  The name must be fully qualified: `use v5.12'
+# enables strict vars and this package declares no %COMMIT_BOOL_COMMON
+# of its own, so the unqualified form does not compile.
+my %bool_pfx_external = (
+	oid => 'Q', # type:commit - git OID hex (40|64)-byte SHA-(1|256)
+		# type:repo - rel2abs_collapsed(GIT_DIR)
+	%PublicInbox::Search::COMMIT_BOOL_COMMON,
+	# we use O/eidx_key to store GIT_DIR key
+);
+
+# Maps user-visible probabilistic (free text) search prefixes to Xapian
+# term prefixes; the empty key is the set searched when no prefix is given.
+my %prob_prefix = ( # copied from PublicInbox::Search
+	# do we care about committer? or partial commit OID?
+	# o => 'XQ', # 'oid:' (bool) is exact, 'o:' (prob) can do partial
+	%PublicInbox::Search::COMMIT_PROB_COMMON,
+
+	# default:
+	'' => join(' ', qw(S A XQUOT XFN),
+			$PublicInbox::Search::NON_QUOTED_BODY),
+);
+
+# read-only
+# Parses $query_string with the (lazily created and cached) query parser
+# in $self->{qp} and returns the parsed query.
+# $opt is accepted but not used yet.
+# NOTE(review): cqparse_new() is not defined in this file, and despite the
+# name no mset is built from the parsed query yet -- WIP.
+sub mset {
+	my ($self, $query_string, $opt) = @_;
+	$self->{qp} //= cqparse_new($self);
+	my $parsed = $self->{qp}->parse_query($query_string,
+						$self->{qp_flags});
+	return $parsed;
+}
+
+
+1;
diff --git a/lib/PublicInbox/CodeSearchIdx.pm b/lib/PublicInbox/CodeSearchIdx.pm
new file mode 100644
index 00000000..8ba2214b
--- /dev/null
+++ b/lib/PublicInbox/CodeSearchIdx.pm
@@ -0,0 +1,146 @@
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# indexer for git coderepos, just commits for now
+package PublicInbox::CodeSearchIdx;
+use v5.12;
+use parent qw(PublicInbox::Lock PublicInbox::SearchIdx);
+use Socket qw(AF_UNIX MSG_EOR SOCK_SEQPACKET);
+use constant FROM_DATE => length(" Mon Sep 17 00:00:00 2001\n");
+use PublicInbox::Eml;
+use PublicInbox::DS ();
+use PublicInbox::InboxWritable ();
+use PublicInbox::IPC qw(nproc_shards);
+use PublicInbox::Admin;
+use POSIX qw(WNOHANG);
+
+our $SEEN_MAX = 100000; # w/o reading trees, git walks commits quickly
+my @FMT = qw(H cn ce ct an ae at s b);
+my @LOG = (qw(log --all --no-decorate --no-color --no-notes -p -M
+ --stdin --no-walk=unsorted), '--pretty=format:%x00'.
+ join('%n', map { "%$_" } @FMT).'%x00');
+
+# Constructor for the coderepo indexer.
+#   $dir - top-level directory of the code index; shards live under
+#          "$dir/c<SCHEMA_VERSION>" and the lock file is "$dir/cidx.lock"
+#   $opt - hashref; keys read here: indexlevel, creat, jobs, fsync, dangerous
+# The class argument is ignored, objects are always blessed into this
+# package.  Dies on an invalid indexlevel or on indexlevel=basic.
+sub new {
+	my (undef, $dir, $opt) = @_;
+	my $level = $opt->{indexlevel} // 'full';
+	$level !~ $PublicInbox::SearchIdx::INDEXLEVELS and
+		die "invalid indexlevel=$level\n";
+	$level eq 'basic' and die "E: indexlevel=basic not supported\n";
+
+	# nothing else in this file loads PublicInbox::CodeSearch, so load
+	# it here and call the constant with parens; a bare
+	# PublicInbox::CodeSearch::SCHEMA_VERSION is rejected by strict subs
+	# when the constant is unknown at compile time
+	require PublicInbox::CodeSearch;
+	my $schema = PublicInbox::CodeSearch::SCHEMA_VERSION();
+	my $self = bless {
+		xpfx => "$dir/c$schema",
+		cidx_dir => $dir,
+		creat => $opt->{creat},
+		indexlevel => $level,
+		transact_bytes => 0,
+		total_bytes => 0,
+		current_info => '',
+		parallel => 1,
+		lock_path => "$dir/cidx.lock",
+	}, __PACKAGE__;
+	# keep the shard count of an existing index, otherwise derive it
+	# from the requested number of jobs
+	$self->{nshard} = $self->count_shards ||
+		nproc_shards({nproc => $opt->{jobs}});
+	$self->{-no_fsync} = 1 if !$opt->{fsync};
+	$self->{-dangerous} = 1 if $opt->{dangerous};
+	$self;
+}
+
+# TODO: may be used for reshard/compact
+# Returns ->xdb_shards_flat evaluated in scalar context; new() treats a
+# false result as "no existing shards".
+sub count_shards {
+	my ($self) = @_;
+	return scalar($self->xdb_shards_flat);
+}
+
+# sharded reader for `git log --pretty=format: --stdin'
+#   $r - read end of the pipe the parent writes commit OIDs to; it
+#        becomes stdin of `git log' and is closed here
+#   $c - SOCK_SEQPACKET consumer (not used yet)
+# Output framing follows @LOG: every commit is "\0", the @FMT fields
+# joined by "\n", another "\0", then the patch text.
+# WIP: parsed commits are only dumped to stderr, nothing is indexed yet.
+sub commit_reader {
+	my ($self, $r, $c) = @_; # $c = SOCK_SEQPACKET consumer
+
+	# the parent process of this shard process writes directly to
+	# the stdin of `git log', we consume git log's stdout:
+	my $rd = $self->{git}->popen(@LOG, undef, { 0 => $r });
+	close $r or die "close: $!";
+
+	local $/ = "\0";
+	defined(scalar(<$rd>)) or return; # discard leading "\0"
+	my (%c, $hdr);
+	while (defined($hdr = <$rd>)) {
+		chomp($hdr);
+		# split the header itself; the limit keeps the newlines
+		# embedded in the last field (%b, the commit body) intact
+		@c{@FMT} = split(/\n/, $hdr, scalar(@FMT));
+		# no patch may follow the final header at EOF
+		chomp($c{patch} = <$rd> // '');
+		# FIXME(WIP): debugging aid, remove once indexing is wired up
+		require Data::Dumper;
+		warn Data::Dumper::Dumper(\%c);
+		# $self->add_xapian($eml, $smsg, [ $H ]);
+	}
+}
+
+# this is a bit of weird pipe+process structure unseen anywhere else in
+# our code base, but maybe LeiToMail can take advantage of it someday
+#
+# Forks $self->{nshard} children which each run commit_reader() on the
+# read end of their own pipe, then feeds them the commit OIDs printed by
+# `git rev-list --all'.  The shard is picked from the first 4 hex digits
+# of the OID.  Feeding stops once more than $SEEN_MAX consecutive OIDs
+# were already seen.  Dies on fork/pipe/close failures or if any child
+# exits with a non-zero status.
+# NOTE(review): seen() is not defined in the visible part of this file.
+sub stream_log {
+	my ($self) = @_;
+	my (%pids, @pipes, $err);
+	# with a signal name (SIGCHLD handler): reap without blocking until
+	# nothing is left; without args: block for a single child
+	my $reap = sub {
+		my ($sig) = @_;
+		do {
+			my $pid = waitpid(-1, $sig ? WNOHANG : 0) or return;
+			return if $pid < 0;
+			my $j = delete $pids{$pid} // "unknown PID:$pid";
+			if ($?) {
+				warn "$j exited with \$?=$?\n";
+				$err = 1;
+			}
+		} while ($sig);
+	};
+
+	local $SIG{CHLD} = $reap;
+	my $nshard = $self->{nshard};
+	for my $n (0..($nshard - 1)) {
+		pipe(my ($r, $w)) or die "pipe: $!";
+		$w->autoflush(1);
+		push @pipes, $w;
+		my $sigset = PublicInbox::DS::block_signals();
+		my $pid = fork // die "fork: $!";
+		if ($pid == 0) {
+			$0 = "code index [$n]";
+			# the child only reads; drop every inherited writer
+			for (@pipes) { close($_) or die "close: $!" }
+			for (qw(TTOU TTIN TERM QUIT INT CHLD)) {
+				$SIG{$_} = 'DEFAULT';
+			}
+			PublicInbox::DS::sig_setmask($sigset);
+			eval { commit_reader($self, $r) };
+			warn "E: $@" if $@;
+			POSIX::_exit($@ ? 1 : 0);
+		} else {
+			$pids{$pid} = "code index [$n]";
+			close($r) or die "close: $!";
+			# restore the mask saved by block_signals, otherwise
+			# the parent keeps signals blocked and every later
+			# child inherits (and "restores") a blocked mask
+			PublicInbox::DS::sig_setmask($sigset);
+		}
+	}
+
+	# children all running, now feed them anything we haven't seen:
+	my $fh = $self->{git}->popen(qw(rev-list --all));
+	my ($H, $n, $seen);
+	while (defined($H = <$fh>)) {
+		chomp $H;
+		if (seen($self, $H)) {
+			last if ++$seen > $SEEN_MAX;
+		} else {
+			$n = hex(substr($H, 0, 4)) % $nshard;
+			say { $pipes[$n] } $H or die "say: $!";
+			$seen = 0;
+		}
+	}
+	close $fh or die "close: $!";
+	# EOF on stdin lets each `git log' (and thus each child) finish
+	for (@pipes) { close($_) or die "close: $!" }
+	$reap->() while %pids;
+	die "subprocess(es) failed\n" if $err;
+}
+
+# accessor for PublicInbox::SearchIdx, returns $self->{git}
+sub git {
+	my ($self) = @_;
+	return $self->{git};
+}
+
+# main entry point
+# WIP stub: only captures the current warning callback and clears
+# $self->{current_info} for the duration of the call; no indexing is
+# started from here yet and $opt is unused.
+sub cidx_sync {
+	my ($self, $opt) = @_;
+	my $warn_cb = $SIG{__WARN__} || \&CORE::warn;
+	local $self->{current_info} = '';
+}
+
+# bypass PublicInbox::SearchIdx::with_umask:
+# alias with_umask in this package to the InboxWritable implementation;
+# the glob is only mentioned once, hence the silenced warning
+{
+	no warnings 'once';
+	*with_umask = \&PublicInbox::InboxWritable::with_umask;
+}
+
+1;
diff --git a/lib/PublicInbox/Search.pm b/lib/PublicInbox/Search.pm
index 86219dfe..c51663ff 100644
--- a/lib/PublicInbox/Search.pm
+++ b/lib/PublicInbox/Search.pm
@@ -110,43 +110,50 @@ sub load_xapian () {
# a prefix common in patch emails
our $LANG = 'english';
+our %COMMIT_BOOL_COMMON = (
+ dfpre => 'XDFPRE',
+ dfpost => 'XDFPOST',
+ dfblob => 'XDFPRE XDFPOST',
+ patchid => 'XDFID',
+);
+
# note: the non-X term prefix allocations are shared with
# Xapian omega, see xapian-applications/omega/docs/termprefixes.rst
my %bool_pfx_external = (
mid => 'Q', # Message-ID (full/exact), this is mostly uniQue
lid => 'G', # newsGroup (or similar entity), just inside <>
- dfpre => 'XDFPRE',
- dfpost => 'XDFPOST',
- dfblob => 'XDFPRE XDFPOST',
- patchid => 'XDFID',
+ %COMMIT_BOOL_COMMON
);
-my $non_quoted_body = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
-my %prob_prefix = (
- # for mairix compatibility
+# for mairix compatibility
+our $NON_QUOTED_BODY = 'XNQ XDFN XDFA XDFB XDFHH XDFCTX XDFPRE XDFPOST XDFID';
+our %COMMIT_PROB_COMMON = (
s => 'S',
- m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
- l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
f => 'A',
- t => 'XTO',
- tc => 'XTO XCC',
- c => 'XCC',
- tcf => 'XTO XCC A',
- a => 'XTO XCC A',
- b => $non_quoted_body . ' XQUOT',
- bs => $non_quoted_body . ' XQUOT S',
+ b => $NON_QUOTED_BODY . ' XQUOT',
+ bs => $NON_QUOTED_BODY . ' XQUOT S',
n => 'XFN',
q => 'XQUOT',
- nq => $non_quoted_body,
+ nq => $NON_QUOTED_BODY,
dfn => 'XDFN',
dfa => 'XDFA',
dfb => 'XDFB',
dfhh => 'XDFHH',
dfctx => 'XDFCTX',
+);
+my %prob_prefix = (
+ m => 'XM', # 'mid:' (bool) is exact, 'm:' (prob) can do partial
+ l => 'XL', # 'lid:' (bool) is exact, 'l:' (prob) can do partial
+ t => 'XTO',
+ tc => 'XTO XCC',
+ c => 'XCC',
+ tcf => 'XTO XCC A',
+ a => 'XTO XCC A',
+ %COMMIT_PROB_COMMON,
# default:
- '' => 'XM S A XQUOT XFN ' . $non_quoted_body,
+ '' => 'XM S A XQUOT XFN ' . $NON_QUOTED_BODY,
);
# not documenting m: and mid: for now, the using the URLs works w/o Xapian
diff --git a/lib/PublicInbox/SearchIdx.pm b/lib/PublicInbox/SearchIdx.pm
index 257b83a5..66f688de 100644
--- a/lib/PublicInbox/SearchIdx.pm
+++ b/lib/PublicInbox/SearchIdx.pm
@@ -134,6 +134,7 @@ sub idx_acquire {
load_xapian_writable();
$flag = $self->{creat} ? $DB_CREATE_OR_OPEN : $DB_OPEN;
}
+ my $owner = $self->{ibx} // $self->{eidx} // $self;
if ($self->{creat}) {
require File::Path;
$self->lock_acquire;
@@ -146,13 +147,11 @@ sub idx_acquire {
require PublicInbox::Syscall;
PublicInbox::Syscall::nodatacow_dir($dir);
$self->{-set_has_threadid_once} = 1;
- if (($self->{ibx} // $self->{eidx})->{-dangerous}) {
- $flag |= $DB_DANGEROUS;
- }
+ $flag |= $DB_DANGEROUS if $owner->{-dangerous};
}
}
return unless defined $flag;
- $flag |= $DB_NO_SYNC if ($self->{ibx} // $self->{eidx})->{-no_fsync};
+ $flag |= $DB_NO_SYNC if $owner->{-no_fsync};
my $xdb = eval { ($X->{WritableDatabase})->new($dir, $flag) };
croak "Failed opening $dir: $@" if $@;
$self->{xdb} = $xdb;
@@ -376,9 +375,9 @@ sub index_xapian { # msg_iter callback
$fh->autoflush(1);
print $fh $s or die "print: $!";
sysseek($fh, 0, SEEK_SET) or die "sysseek: $!";
- my $id = ($self->{ibx} // $self->{eidx})->git->qx(
- [qw(patch-id --stable)],
- {}, { 0 => $fh, 2 => $eh });
+ my $id = ($self->{git} // ($self->{ibx} // $self->{eidx})->git
+ )->qx([qw(patch-id --stable)], {},
+ { 0 => $fh, 2 => $eh });
$id =~ /\A([a-f0-9]{40,})/ and $doc->add_term('XDFID'.$1);
seek($eh, 0, SEEK_SET) or die "seek: $!";
while (<$eh>) { warn $_ }
@@ -414,14 +413,16 @@ sub index_list_id ($$$) {
sub index_ids ($$$$) {
my ($self, $doc, $hdr, $mids) = @_;
- for my $mid (@$mids) {
- index_phrase($self, $mid, 1, 'XM');
+ if (!$self->{-repo_idx}) {
+ for my $mid (@$mids) {
+ index_phrase($self, $mid, 1, 'XM');
- # because too many Message-IDs are prefixed with
- # "Pine.LNX."...
- if ($mid =~ /\w{12,}/) {
- my @long = ($mid =~ /(\w{3,}+)/g);
- index_phrase($self, join(' ', @long), 1, 'XM');
+ # because too many Message-IDs are prefixed with
+ # "Pine.LNX."...
+ if ($mid =~ /\w{12,}/) {
+ my @long = ($mid =~ /(\w{3,}+)/g);
+ index_phrase($self, join(' ', @long), 1, 'XM');
+ }
}
}
$doc->add_boolean_term('Q' . $_) for @$mids;
@@ -438,10 +439,11 @@ sub eml2doc ($$$;$) {
add_val($doc, PublicInbox::Search::YYYYMMDD(), $yyyymmdd);
my $dt = strftime('%Y%m%d%H%M%S', @ds);
add_val($doc, PublicInbox::Search::DT(), $dt);
- add_val($doc, PublicInbox::Search::BYTES(), $smsg->{bytes});
- add_val($doc, PublicInbox::Search::UID(), $smsg->{num});
- add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid});
-
+ if (!$self->{-repo_idx}) {
+ add_val($doc, PublicInbox::Search::BYTES(), $smsg->{bytes});
+ add_val($doc, PublicInbox::Search::UID(), $smsg->{num});
+ add_val($doc, PublicInbox::Search::THREADID, $smsg->{tid});
+ }
my $tg = term_generator($self);
$tg->set_document($doc);
index_headers($self, $smsg);
diff --git a/script/public-inbox-cindex b/script/public-inbox-cindex
new file mode 100755
index 00000000..f6717239
--- /dev/null
+++ b/script/public-inbox-cindex
@@ -0,0 +1,70 @@
+#!perl -w
+# Copyright (C) all contributors <meta@public-inbox.org>
+# License: AGPL-3.0+ <https://www.gnu.org/licenses/agpl-3.0.txt>
+#
+# Command-line front end for PublicInbox::CodeSearchIdx: parses options,
+# then runs --gc, --watch or a one-shot sync on CINDEX_DIR.
+use v5.12;
+use Getopt::Long qw(:config gnu_getopt no_ignore_case auto_abbrev);
+my $help = <<EOF; # the following should fit w/o scrolling in 80x24 term:
+usage: public-inbox-cindex [options] CINDEX_DIR [GIT_DIR...]
+
+ Create and update detached search indices for coderepos
+
+ --no-fsync speed up indexing, risk corruption on power outage
+ -L LEVEL `medium', or `full' (default: full)
+ --all index all configured repos
+ --jobs=NUM set or disable parallelization (NUM=0)
+ --batch-size=BYTES flush changes to OS after a given number of bytes
+ --max-size=BYTES do not index messages larger than the given size
+ --gc perform garbage collection instead of indexing
+ --reindex index previously indexed repos
+ --verbose | -v increase verbosity (may be repeated)
+
+BYTES may use `k', `m', and `g' suffixes (e.g. `10m' for 10 megabytes)
+See public-inbox-cindex(1) man page for full documentation.
+EOF
+my $opt = { quiet => -1, compact => 0, fsync => 1, scan => 1 };
+GetOptions($opt, qw(verbose|v+ reindex rethread compact|c+ jobs|j=i
+		fsync|sync! dangerous
+		indexlevel|index-level|L=s max_size|max-size=s
+		batch_size|batch-size=s
+		dedupe:s@ gc commit-interval=i watch scan! dry-run|n
+		all C=s@ help|h))
+	or die $help;
+if ($opt->{help}) { print $help; exit 0 };
+die "--jobs must be >= 0\n" if defined $opt->{jobs} && $opt->{jobs} < 0;
+require IO::Handle;
+STDOUT->autoflush(1);
+STDERR->autoflush(1);
+local $SIG{USR1} = 'IGNORE'; # to be overridden in cidx_sync
+# require lazily to speed up --help
+require PublicInbox::Admin;
+PublicInbox::Admin::do_chdir(delete $opt->{C});
+my $cfg = PublicInbox::Config->new;
+my $cidx_dir = shift(@ARGV) // die "E: $help";
+# whatever remains after CINDEX_DIR is the list of repository paths
+my @git_dirs = @ARGV;
+if ($opt->{gc}) {
+	die "E: repository paths must not be specified with --gc\n"
+		if @git_dirs;
+	for my $sw (qw(watch dry-run)) {
+		die "E: --$sw is not compatible with --gc\n" if $opt->{$sw};
+	}
+}
+PublicInbox::Admin::require_or_die(qw(-search));
+PublicInbox::Admin::progress_prepare($opt);
+my $env = PublicInbox::Admin::index_prepare($opt, $cfg);
+local %ENV = (%ENV, %$env) if $env;
+require PublicInbox::CodeSearchIdx;
+my $cidx = PublicInbox::CodeSearchIdx->new($cidx_dir, $opt);
+if ($opt->{gc}) {
+	$cidx->attach_config($cfg);
+	$cidx->cidx_gc($opt);
+} else {
+	# NOTE(review): no option in the GetOptions spec sets {config},
+	# so the second branch is the one taken -- confirm intent
+	if ($opt->{config}) {
+		$cidx->attach_config($cfg);
+	} else {
+		# was \@ibxs, which is never declared and does not compile
+		# under strict vars
+		$cidx->attach_config($cfg, \@git_dirs);
+	}
+	if ($opt->{watch}) {
+		$cfg = undef; # save memory only after SIGHUP
+		$cidx->cidx_watch($opt);
+	} else {
+		$cidx->cidx_sync($opt);
+	}
+}
prev parent reply other threads:[~2023-02-17 4:01 UTC|newest]
Thread overview: 2+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-02-17 4:01 [PATCH 1/2] ipc: move nproc_shards from v2writable Eric Wong
2023-02-17 4:01 ` Eric Wong [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230217040113.111644-2-e@80x24.org \
--to=e@80x24.org \
--cc=spew@80x24.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).